Merge tag 'xfs-merge-6.14' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

+6

fs/xfs/Makefile

··· 51 51 xfs_rmap_btree.o \ 52 52 xfs_refcount.o \ 53 53 xfs_refcount_btree.o \ 54 + xfs_rtrefcount_btree.o \ 55 + xfs_rtrmap_btree.o \ 54 56 xfs_sb.o \ 55 57 xfs_symlink_remote.o \ 56 58 xfs_trans_inode.o \ ··· 195 193 xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ 196 194 rgsuper.o \ 197 195 rtbitmap.o \ 196 + rtrefcount.o \ 197 + rtrmap.o \ 198 198 rtsummary.o \ 199 199 ) 200 200 ··· 236 232 237 233 xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ 238 234 rtbitmap_repair.o \ 235 + rtrefcount_repair.o \ 236 + rtrmap_repair.o \ 239 237 rtsummary_repair.o \ 240 238 ) 241 239

+3

fs/xfs/libxfs/xfs_ag_resv.c

··· 114 114 case XFS_AG_RESV_RMAPBT: 115 115 len -= xfs_perag_resv(pag, type)->ar_reserved; 116 116 break; 117 + case XFS_AG_RESV_METAFILE: 117 118 case XFS_AG_RESV_NONE: 118 119 /* empty */ 119 120 break; ··· 348 347 349 348 switch (type) { 350 349 case XFS_AG_RESV_AGFL: 350 + case XFS_AG_RESV_METAFILE: 351 351 return; 352 352 case XFS_AG_RESV_METADATA: 353 353 case XFS_AG_RESV_RMAPBT: ··· 391 389 392 390 switch (type) { 393 391 case XFS_AG_RESV_AGFL: 392 + case XFS_AG_RESV_METAFILE: 394 393 return; 395 394 case XFS_AG_RESV_METADATA: 396 395 case XFS_AG_RESV_RMAPBT:

+1 -3

fs/xfs/libxfs/xfs_attr.c

··· 1004 1004 unsigned int blks; /* space reservation */ 1005 1005 int error; /* error return value */ 1006 1006 1007 - if (xfs_is_metadir_inode(ip)) 1008 - ASSERT(XFS_IS_DQDETACHED(ip)); 1009 - else 1007 + if (!xfs_is_metadir_inode(ip)) 1010 1008 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 1011 1009 1012 1010 blks = XFS_ADDAFORK_SPACE_RES(mp);

+19 -15

fs/xfs/libxfs/xfs_bmap.c

··· 615 615 xfs_trans_binval(tp, cbp); 616 616 if (cur->bc_levels[0].bp == cbp) 617 617 cur->bc_levels[0].bp = NULL; 618 - xfs_iroot_realloc(ip, -1, whichfork); 618 + xfs_bmap_broot_realloc(ip, whichfork, 0); 619 619 ASSERT(ifp->if_broot == NULL); 620 620 ifp->if_format = XFS_DINODE_FMT_EXTENTS; 621 621 *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); ··· 659 659 * Make space in the inode incore. This needs to be undone if we fail 660 660 * to expand the root. 661 661 */ 662 - xfs_iroot_realloc(ip, 1, whichfork); 662 + block = xfs_bmap_broot_realloc(ip, whichfork, 1); 663 663 664 664 /* 665 665 * Fill in the root. 666 666 */ 667 - block = ifp->if_broot; 668 667 xfs_bmbt_init_block(ip, block, NULL, 1, 1); 669 668 /* 670 669 * Need a cursor. Can't allocate until bb_level is filled in. ··· 745 746 out_unreserve_dquot: 746 747 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); 747 748 out_root_realloc: 748 - xfs_iroot_realloc(ip, -1, whichfork); 749 + xfs_bmap_broot_realloc(ip, whichfork, 0); 749 750 ifp->if_format = XFS_DINODE_FMT_EXTENTS; 750 751 ASSERT(ifp->if_broot == NULL); 751 752 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); ··· 1042 1043 int error; /* error return value */ 1043 1044 1044 1045 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); 1045 - if (xfs_is_metadir_inode(ip)) 1046 - ASSERT(XFS_IS_DQDETACHED(ip)); 1047 - else 1046 + if (!xfs_is_metadir_inode(ip)) 1048 1047 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 1049 1048 ASSERT(!xfs_inode_has_attr_fork(ip)); 1050 1049 ··· 4564 4567 * the refcount btree for orphan recovery. 4565 4568 */ 4566 4569 if (whichfork == XFS_COW_FORK) 4567 - xfs_refcount_alloc_cow_extent(tp, bma.blkno, 4568 - bma.length); 4570 + xfs_refcount_alloc_cow_extent(tp, 4571 + XFS_IS_REALTIME_INODE(ip), 4572 + bma.blkno, bma.length); 4569 4573 } 4570 4574 4571 4575 /* Deal with the allocated space we found. 
*/ ··· 4741 4743 *seq = READ_ONCE(ifp->if_seq); 4742 4744 4743 4745 if (whichfork == XFS_COW_FORK) 4744 - xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); 4746 + xfs_refcount_alloc_cow_extent(tp, XFS_IS_REALTIME_INODE(ip), 4747 + bma.blkno, bma.length); 4745 4748 4746 4749 error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, 4747 4750 whichfork); ··· 5390 5391 bool isrt = xfs_ifork_is_realtime(ip, whichfork); 5391 5392 5392 5393 if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { 5393 - xfs_refcount_decrease_extent(tp, del); 5394 + xfs_refcount_decrease_extent(tp, isrt, del); 5394 5395 } else if (isrt && !xfs_has_rtgroups(mp)) { 5395 5396 error = xfs_bmap_free_rtblocks(tp, del); 5396 5397 } else { ··· 6500 6501 * No point in aligning allocations if we need to COW to actually 6501 6502 * write to them. 6502 6503 */ 6503 - if (xfs_is_always_cow_inode(ip)) 6504 - return 0; 6505 - if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) 6504 + if (!xfs_is_always_cow_inode(ip) && 6505 + (ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize) 6506 6506 return ip->i_extsize; 6507 6507 if (XFS_IS_REALTIME_INODE(ip) && 6508 6508 ip->i_mount->m_sb.sb_rextsize > 1) ··· 6524 6526 a = 0; 6525 6527 if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) 6526 6528 a = ip->i_cowextsize; 6527 - b = xfs_get_extsz_hint(ip); 6529 + if (XFS_IS_REALTIME_INODE(ip)) { 6530 + b = 0; 6531 + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) 6532 + b = ip->i_extsize; 6533 + } else { 6534 + b = xfs_get_extsz_hint(ip); 6535 + } 6528 6536 6529 6537 a = max(a, b); 6530 6538 if (a == 0)

+111

fs/xfs/libxfs/xfs_bmap_btree.c

··· 516 516 be64_to_cpu(key2->bmbt.br_startoff)); 517 517 } 518 518 519 + static inline void 520 + xfs_bmbt_move_ptrs( 521 + struct xfs_mount *mp, 522 + struct xfs_btree_block *broot, 523 + short old_size, 524 + size_t new_size, 525 + unsigned int numrecs) 526 + { 527 + void *dptr; 528 + void *sptr; 529 + 530 + sptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, old_size); 531 + dptr = xfs_bmap_broot_ptr_addr(mp, broot, 1, new_size); 532 + memmove(dptr, sptr, numrecs * sizeof(xfs_bmbt_ptr_t)); 533 + } 534 + 535 + /* 536 + * Reallocate the space for if_broot based on the number of records. Move the 537 + * records and pointers in if_broot to fit the new size. When shrinking this 538 + * will eliminate holes between the records and pointers created by the caller. 539 + * When growing this will create holes to be filled in by the caller. 540 + * 541 + * The caller must not request to add more records than would fit in the 542 + * on-disk inode root. If the if_broot is currently NULL, then if we are 543 + * adding records, one will be allocated. The caller must also not request 544 + * that the number of records go below zero, although it can go to zero. 545 + * 546 + * ip -- the inode whose if_broot area is changing 547 + * whichfork -- which inode fork to change 548 + * new_numrecs -- the new number of records requested for the if_broot array 549 + * 550 + * Returns the incore btree root block. 551 + */ 552 + struct xfs_btree_block * 553 + xfs_bmap_broot_realloc( 554 + struct xfs_inode *ip, 555 + int whichfork, 556 + unsigned int new_numrecs) 557 + { 558 + struct xfs_mount *mp = ip->i_mount; 559 + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 560 + struct xfs_btree_block *broot; 561 + unsigned int new_size; 562 + unsigned int old_size = ifp->if_broot_bytes; 563 + 564 + /* 565 + * Block mapping btrees do not support storing zero records; if this 566 + * happens, the fork is being changed to FMT_EXTENTS. Free the broot 567 + * and get out. 
568 + */ 569 + if (new_numrecs == 0) 570 + return xfs_broot_realloc(ifp, 0); 571 + 572 + new_size = xfs_bmap_broot_space_calc(mp, new_numrecs); 573 + 574 + /* Handle the nop case quietly. */ 575 + if (new_size == old_size) 576 + return ifp->if_broot; 577 + 578 + if (new_size > old_size) { 579 + unsigned int old_numrecs; 580 + 581 + /* 582 + * If there wasn't any memory allocated before, just 583 + * allocate it now and get out. 584 + */ 585 + if (old_size == 0) 586 + return xfs_broot_realloc(ifp, new_size); 587 + 588 + /* 589 + * If there is already an existing if_broot, then we need 590 + * to realloc() it and shift the pointers to their new 591 + * location. The records don't change location because 592 + * they are kept butted up against the btree block header. 593 + */ 594 + old_numrecs = xfs_bmbt_maxrecs(mp, old_size, false); 595 + broot = xfs_broot_realloc(ifp, new_size); 596 + ASSERT(xfs_bmap_bmdr_space(broot) <= 597 + xfs_inode_fork_size(ip, whichfork)); 598 + xfs_bmbt_move_ptrs(mp, broot, old_size, new_size, old_numrecs); 599 + return broot; 600 + } 601 + 602 + /* 603 + * We're reducing, but not totally eliminating, numrecs. In this case, 604 + * we are shrinking the if_broot buffer, so it must already exist. 605 + */ 606 + ASSERT(ifp->if_broot != NULL && old_size > 0 && new_size > 0); 607 + 608 + /* 609 + * Shrink the btree root by moving the bmbt pointers, since they are 610 + * not butted up against the btree block header, then reallocating 611 + * broot. 
612 + */ 613 + xfs_bmbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, new_numrecs); 614 + broot = xfs_broot_realloc(ifp, new_size); 615 + ASSERT(xfs_bmap_bmdr_space(broot) <= 616 + xfs_inode_fork_size(ip, whichfork)); 617 + return broot; 618 + } 619 + 620 + static struct xfs_btree_block * 621 + xfs_bmbt_broot_realloc( 622 + struct xfs_btree_cur *cur, 623 + unsigned int new_numrecs) 624 + { 625 + return xfs_bmap_broot_realloc(cur->bc_ino.ip, cur->bc_ino.whichfork, 626 + new_numrecs); 627 + } 628 + 519 629 const struct xfs_btree_ops xfs_bmbt_ops = { 520 630 .name = "bmap", 521 631 .type = XFS_BTREE_TYPE_INODE, ··· 653 543 .keys_inorder = xfs_bmbt_keys_inorder, 654 544 .recs_inorder = xfs_bmbt_recs_inorder, 655 545 .keys_contiguous = xfs_bmbt_keys_contiguous, 546 + .broot_realloc = xfs_bmbt_broot_realloc, 656 547 }; 657 548 658 549 /*

+3

fs/xfs/libxfs/xfs_bmap_btree.h

··· 198 198 return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs)); 199 199 } 200 200 201 + struct xfs_btree_block *xfs_bmap_broot_realloc(struct xfs_inode *ip, 202 + int whichfork, unsigned int new_numrecs); 203 + 201 204 #endif /* __XFS_BMAP_BTREE_H__ */

+328 -83

fs/xfs/libxfs/xfs_btree.c

··· 30 30 #include "xfs_health.h" 31 31 #include "xfs_buf_mem.h" 32 32 #include "xfs_btree_mem.h" 33 + #include "xfs_rtrmap_btree.h" 34 + #include "xfs_bmap.h" 35 + #include "xfs_rmap.h" 36 + #include "xfs_quota.h" 37 + #include "xfs_metafile.h" 38 + #include "xfs_rtrefcount_btree.h" 33 39 34 40 /* 35 41 * Btree magic numbers. ··· 1543 1537 int first, 1544 1538 int last) 1545 1539 { 1540 + if (!bp) { 1541 + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1542 + xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1543 + return; 1544 + } 1546 1545 1547 1546 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1548 1547 xfs_trans_log_buf(cur->bc_tp, bp, 1549 1548 xfs_btree_rec_offset(cur, first), 1550 1549 xfs_btree_rec_offset(cur, last + 1) - 1); 1551 - 1552 1550 } 1553 1551 1554 1552 /* ··· 3088 3078 #define xfs_btree_split __xfs_btree_split 3089 3079 #endif /* __KERNEL__ */ 3090 3080 3081 + /* Move the records from a root leaf block to a separate block. */ 3082 + STATIC void 3083 + xfs_btree_promote_leaf_iroot( 3084 + struct xfs_btree_cur *cur, 3085 + struct xfs_btree_block *block, 3086 + struct xfs_buf *cbp, 3087 + union xfs_btree_ptr *cptr, 3088 + struct xfs_btree_block *cblock) 3089 + { 3090 + union xfs_btree_rec *rp; 3091 + union xfs_btree_rec *crp; 3092 + union xfs_btree_key *kp; 3093 + union xfs_btree_ptr *pp; 3094 + struct xfs_btree_block *broot; 3095 + int numrecs = xfs_btree_get_numrecs(block); 3096 + 3097 + /* Copy the records from the leaf broot into the new child block. */ 3098 + rp = xfs_btree_rec_addr(cur, 1, block); 3099 + crp = xfs_btree_rec_addr(cur, 1, cblock); 3100 + xfs_btree_copy_recs(cur, crp, rp, numrecs); 3101 + 3102 + /* 3103 + * Increment the tree height. 3104 + * 3105 + * Trickery here: The amount of memory that we need per record for the 3106 + * ifork's btree root block may change when we convert the broot from a 3107 + * leaf to a node block. 
Free the existing leaf broot so that nobody 3108 + * thinks we need to migrate node pointers when we realloc the broot 3109 + * buffer after bumping nlevels. 3110 + */ 3111 + cur->bc_ops->broot_realloc(cur, 0); 3112 + cur->bc_nlevels++; 3113 + cur->bc_levels[1].ptr = 1; 3114 + 3115 + /* 3116 + * Allocate a new node broot and initialize it to point to the new 3117 + * child block. 3118 + */ 3119 + broot = cur->bc_ops->broot_realloc(cur, 1); 3120 + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 3121 + cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino); 3122 + 3123 + pp = xfs_btree_ptr_addr(cur, 1, broot); 3124 + kp = xfs_btree_key_addr(cur, 1, broot); 3125 + xfs_btree_copy_ptrs(cur, pp, cptr, 1); 3126 + xfs_btree_get_keys(cur, cblock, kp); 3127 + 3128 + /* Attach the new block to the cursor and log it. */ 3129 + xfs_btree_setbuf(cur, 0, cbp); 3130 + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3131 + xfs_btree_log_recs(cur, cbp, 1, numrecs); 3132 + } 3133 + 3134 + /* 3135 + * Move the keys and pointers from a root block to a separate block. 3136 + * 3137 + * Since the keyptr size does not change, all we have to do is increase the 3138 + * tree height, copy the keyptrs to the new internal node (cblock), shrink 3139 + * the root, and copy the pointers there. 3140 + */ 3141 + STATIC int 3142 + xfs_btree_promote_node_iroot( 3143 + struct xfs_btree_cur *cur, 3144 + struct xfs_btree_block *block, 3145 + int level, 3146 + struct xfs_buf *cbp, 3147 + union xfs_btree_ptr *cptr, 3148 + struct xfs_btree_block *cblock) 3149 + { 3150 + union xfs_btree_key *ckp; 3151 + union xfs_btree_key *kp; 3152 + union xfs_btree_ptr *cpp; 3153 + union xfs_btree_ptr *pp; 3154 + int i; 3155 + int error; 3156 + int numrecs = xfs_btree_get_numrecs(block); 3157 + 3158 + /* 3159 + * Increase tree height, adjusting the root block level to match. 3160 + * We cannot change the root btree node size until we've copied the 3161 + * block contents to the new child block. 
3162 + */ 3163 + be16_add_cpu(&block->bb_level, 1); 3164 + cur->bc_nlevels++; 3165 + cur->bc_levels[level + 1].ptr = 1; 3166 + 3167 + /* 3168 + * Adjust the root btree record count, then copy the keys from the old 3169 + * root to the new child block. 3170 + */ 3171 + xfs_btree_set_numrecs(block, 1); 3172 + kp = xfs_btree_key_addr(cur, 1, block); 3173 + ckp = xfs_btree_key_addr(cur, 1, cblock); 3174 + xfs_btree_copy_keys(cur, ckp, kp, numrecs); 3175 + 3176 + /* Check the pointers and copy them to the new child block. */ 3177 + pp = xfs_btree_ptr_addr(cur, 1, block); 3178 + cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3179 + for (i = 0; i < numrecs; i++) { 3180 + error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3181 + if (error) 3182 + return error; 3183 + } 3184 + xfs_btree_copy_ptrs(cur, cpp, pp, numrecs); 3185 + 3186 + /* 3187 + * Set the first keyptr to point to the new child block, then shrink 3188 + * the memory buffer for the root block. 3189 + */ 3190 + error = xfs_btree_debug_check_ptr(cur, cptr, 0, level); 3191 + if (error) 3192 + return error; 3193 + xfs_btree_copy_ptrs(cur, pp, cptr, 1); 3194 + xfs_btree_get_keys(cur, cblock, kp); 3195 + 3196 + cur->bc_ops->broot_realloc(cur, 1); 3197 + 3198 + /* Attach the new block to the cursor and log it. */ 3199 + xfs_btree_setbuf(cur, level, cbp); 3200 + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3201 + xfs_btree_log_keys(cur, cbp, 1, numrecs); 3202 + xfs_btree_log_ptrs(cur, cbp, 1, numrecs); 3203 + return 0; 3204 + } 3205 + 3091 3206 /* 3092 3207 * Copy the old inode root contents into a real block and make the 3093 3208 * broot point to it. 
··· 3226 3091 struct xfs_buf *cbp; /* buffer for cblock */ 3227 3092 struct xfs_btree_block *block; /* btree block */ 3228 3093 struct xfs_btree_block *cblock; /* child btree block */ 3229 - union xfs_btree_key *ckp; /* child key pointer */ 3230 - union xfs_btree_ptr *cpp; /* child ptr pointer */ 3231 - union xfs_btree_key *kp; /* pointer to btree key */ 3232 - union xfs_btree_ptr *pp; /* pointer to block addr */ 3094 + union xfs_btree_ptr aptr; 3233 3095 union xfs_btree_ptr nptr; /* new block addr */ 3234 3096 int level; /* btree level */ 3235 3097 int error; /* error return code */ 3236 - int i; /* loop counter */ 3237 3098 3238 3099 XFS_BTREE_STATS_INC(cur, newroot); 3239 3100 ··· 3238 3107 level = cur->bc_nlevels - 1; 3239 3108 3240 3109 block = xfs_btree_get_iroot(cur); 3241 - pp = xfs_btree_ptr_addr(cur, 1, block); 3110 + ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)); 3111 + if (level > 0) 3112 + aptr = *xfs_btree_ptr_addr(cur, 1, block); 3113 + else 3114 + aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp, 3115 + cur->bc_ino.ip->i_ino)); 3242 3116 3243 3117 /* Allocate the new block. If we can't do it, we're toast. Give up. 
*/ 3244 - error = xfs_btree_alloc_block(cur, pp, &nptr, stat); 3118 + error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat); 3245 3119 if (error) 3246 3120 goto error0; 3247 3121 if (*stat == 0) ··· 3272 3136 cblock->bb_u.s.bb_blkno = bno; 3273 3137 } 3274 3138 3275 - be16_add_cpu(&block->bb_level, 1); 3276 - xfs_btree_set_numrecs(block, 1); 3277 - cur->bc_nlevels++; 3278 - ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3279 - cur->bc_levels[level + 1].ptr = 1; 3280 - 3281 - kp = xfs_btree_key_addr(cur, 1, block); 3282 - ckp = xfs_btree_key_addr(cur, 1, cblock); 3283 - xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); 3284 - 3285 - cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3286 - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { 3287 - error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3139 + if (level > 0) { 3140 + error = xfs_btree_promote_node_iroot(cur, block, level, cbp, 3141 + &nptr, cblock); 3288 3142 if (error) 3289 3143 goto error0; 3144 + } else { 3145 + xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock); 3290 3146 } 3291 3147 3292 - xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); 3293 - 3294 - error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); 3295 - if (error) 3296 - goto error0; 3297 - 3298 - xfs_btree_copy_ptrs(cur, pp, &nptr, 1); 3299 - 3300 - xfs_iroot_realloc(cur->bc_ino.ip, 3301 - 1 - xfs_btree_get_numrecs(cblock), 3302 - cur->bc_ino.whichfork); 3303 - 3304 - xfs_btree_setbuf(cur, level, cbp); 3305 - 3306 - /* 3307 - * Do all this logging at the end so that 3308 - * the root is at the right level. 
3309 - */ 3310 - xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3311 - xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3312 - xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3313 - 3314 - *logflags |= 3315 - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); 3148 + *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); 3316 3149 *stat = 1; 3317 3150 return 0; 3318 3151 error0: ··· 3452 3347 3453 3348 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3454 3349 /* A root block that can be made bigger. */ 3455 - xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); 3350 + cur->bc_ops->broot_realloc(cur, numrecs + 1); 3456 3351 *stat = 1; 3457 3352 } else { 3458 3353 /* A root block that needs replacing */ ··· 3798 3693 return error; 3799 3694 } 3800 3695 3696 + /* Move the records from a child leaf block to the root block. */ 3697 + STATIC void 3698 + xfs_btree_demote_leaf_child( 3699 + struct xfs_btree_cur *cur, 3700 + struct xfs_btree_block *cblock, 3701 + int numrecs) 3702 + { 3703 + union xfs_btree_rec *rp; 3704 + union xfs_btree_rec *crp; 3705 + struct xfs_btree_block *broot; 3706 + 3707 + /* 3708 + * Decrease the tree height. 3709 + * 3710 + * Trickery here: The amount of memory that we need per record for the 3711 + * ifork's btree root block may change when we convert the broot from a 3712 + * node to a leaf. Free the old node broot so that we can get a fresh 3713 + * leaf broot. 3714 + */ 3715 + cur->bc_ops->broot_realloc(cur, 0); 3716 + cur->bc_nlevels--; 3717 + 3718 + /* 3719 + * Allocate a new leaf broot and copy the records from the old child. 3720 + * Detach the old child from the cursor. 
3721 + */ 3722 + broot = cur->bc_ops->broot_realloc(cur, numrecs); 3723 + xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs, 3724 + cur->bc_ino.ip->i_ino); 3725 + 3726 + rp = xfs_btree_rec_addr(cur, 1, broot); 3727 + crp = xfs_btree_rec_addr(cur, 1, cblock); 3728 + xfs_btree_copy_recs(cur, rp, crp, numrecs); 3729 + 3730 + cur->bc_levels[0].bp = NULL; 3731 + } 3732 + 3733 + /* 3734 + * Move the keyptrs from a child node block to the root block. 3735 + * 3736 + * Since the keyptr size does not change, all we have to do is resize the 3737 + * root to fit the child's keyptrs, copy the keyptrs from the doomed child 3738 + * (cblock) into the root, and decrease the tree height. 3739 + */ 3740 + STATIC int 3741 + xfs_btree_demote_node_child( 3742 + struct xfs_btree_cur *cur, 3743 + struct xfs_btree_block *cblock, 3744 + int level, 3745 + int numrecs) 3746 + { 3747 + struct xfs_btree_block *block; 3748 + union xfs_btree_key *ckp; 3749 + union xfs_btree_key *kp; 3750 + union xfs_btree_ptr *cpp; 3751 + union xfs_btree_ptr *pp; 3752 + int i; 3753 + int error; 3754 + 3755 + /* 3756 + * Adjust the root btree node size and the record count to match the 3757 + * doomed child so that we can copy the keyptrs ahead of changing the 3758 + * tree shape. 3759 + */ 3760 + block = cur->bc_ops->broot_realloc(cur, numrecs); 3761 + 3762 + xfs_btree_set_numrecs(block, numrecs); 3763 + ASSERT(block->bb_numrecs == cblock->bb_numrecs); 3764 + 3765 + /* Copy keys from the doomed block. */ 3766 + kp = xfs_btree_key_addr(cur, 1, block); 3767 + ckp = xfs_btree_key_addr(cur, 1, cblock); 3768 + xfs_btree_copy_keys(cur, kp, ckp, numrecs); 3769 + 3770 + /* Copy pointers from the doomed block. 
*/ 3771 + pp = xfs_btree_ptr_addr(cur, 1, block); 3772 + cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3773 + for (i = 0; i < numrecs; i++) { 3774 + error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); 3775 + if (error) 3776 + return error; 3777 + } 3778 + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3779 + 3780 + /* Decrease tree height, adjusting the root block level to match. */ 3781 + cur->bc_levels[level - 1].bp = NULL; 3782 + be16_add_cpu(&block->bb_level, -1); 3783 + cur->bc_nlevels--; 3784 + return 0; 3785 + } 3786 + 3801 3787 /* 3802 3788 * Try to merge a non-leaf block back into the inode root. 3803 3789 * ··· 3901 3705 xfs_btree_kill_iroot( 3902 3706 struct xfs_btree_cur *cur) 3903 3707 { 3904 - int whichfork = cur->bc_ino.whichfork; 3905 3708 struct xfs_inode *ip = cur->bc_ino.ip; 3906 - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 3907 3709 struct xfs_btree_block *block; 3908 3710 struct xfs_btree_block *cblock; 3909 - union xfs_btree_key *kp; 3910 - union xfs_btree_key *ckp; 3911 - union xfs_btree_ptr *pp; 3912 - union xfs_btree_ptr *cpp; 3913 3711 struct xfs_buf *cbp; 3914 3712 int level; 3915 - int index; 3916 3713 int numrecs; 3917 3714 int error; 3918 3715 #ifdef DEBUG 3919 3716 union xfs_btree_ptr ptr; 3920 3717 #endif 3921 - int i; 3922 3718 3923 3719 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3924 - ASSERT(cur->bc_nlevels > 1); 3720 + ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) || 3721 + cur->bc_nlevels > 1); 3925 3722 3926 3723 /* 3927 3724 * Don't deal with the root block needs to be a leaf case. 3928 3725 * We're just going to turn the thing back into extents anyway. 3929 3726 */ 3930 3727 level = cur->bc_nlevels - 1; 3931 - if (level == 1) 3728 + if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) 3729 + goto out0; 3730 + 3731 + /* If we're already a leaf, jump out. 
*/ 3732 + if (level == 0) 3932 3733 goto out0; 3933 3734 3934 3735 /* ··· 3955 3762 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3956 3763 #endif 3957 3764 3958 - index = numrecs - cur->bc_ops->get_maxrecs(cur, level); 3959 - if (index) { 3960 - xfs_iroot_realloc(cur->bc_ino.ip, index, 3961 - cur->bc_ino.whichfork); 3962 - block = ifp->if_broot; 3963 - } 3964 - 3965 - be16_add_cpu(&block->bb_numrecs, index); 3966 - ASSERT(block->bb_numrecs == cblock->bb_numrecs); 3967 - 3968 - kp = xfs_btree_key_addr(cur, 1, block); 3969 - ckp = xfs_btree_key_addr(cur, 1, cblock); 3970 - xfs_btree_copy_keys(cur, kp, ckp, numrecs); 3971 - 3972 - pp = xfs_btree_ptr_addr(cur, 1, block); 3973 - cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3974 - 3975 - for (i = 0; i < numrecs; i++) { 3976 - error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); 3765 + if (level > 1) { 3766 + error = xfs_btree_demote_node_child(cur, cblock, level, 3767 + numrecs); 3977 3768 if (error) 3978 3769 return error; 3979 - } 3980 - 3981 - xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3770 + } else 3771 + xfs_btree_demote_leaf_child(cur, cblock, numrecs); 3982 3772 3983 3773 error = xfs_btree_free_block(cur, cbp); 3984 3774 if (error) 3985 3775 return error; 3986 3776 3987 - cur->bc_levels[level - 1].bp = NULL; 3988 - be16_add_cpu(&block->bb_level, -1); 3989 3777 xfs_trans_log_inode(cur->bc_tp, ip, 3990 3778 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); 3991 - cur->bc_nlevels--; 3992 3779 out0: 3993 3780 return 0; 3994 3781 } ··· 4122 3949 /* 4123 3950 * We're at the root level. First, shrink the root block in-memory. 4124 3951 * Try to get rid of the next level down. If we can't then there's 4125 - * nothing left to do. 3952 + * nothing left to do. numrecs was decremented above. 
4126 3953 */ 4127 3954 if (xfs_btree_at_iroot(cur, level)) { 4128 - xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); 3955 + cur->bc_ops->broot_realloc(cur, numrecs); 4129 3956 4130 3957 error = xfs_btree_kill_iroot(cur); 4131 3958 if (error) ··· 5533 5360 error = xfs_refcountbt_init_cur_cache(); 5534 5361 if (error) 5535 5362 goto err; 5363 + error = xfs_rtrmapbt_init_cur_cache(); 5364 + if (error) 5365 + goto err; 5366 + error = xfs_rtrefcountbt_init_cur_cache(); 5367 + if (error) 5368 + goto err; 5536 5369 5537 5370 return 0; 5538 5371 err: ··· 5555 5376 xfs_bmbt_destroy_cur_cache(); 5556 5377 xfs_rmapbt_destroy_cur_cache(); 5557 5378 xfs_refcountbt_destroy_cur_cache(); 5379 + xfs_rtrmapbt_destroy_cur_cache(); 5380 + xfs_rtrefcountbt_destroy_cur_cache(); 5558 5381 } 5559 5382 5560 5383 /* Move the btree cursor before the first record. */ ··· 5583 5402 return -EFSCORRUPTED; 5584 5403 } 5585 5404 5405 + return 0; 5406 + } 5407 + 5408 + /* Allocate a block for an inode-rooted metadata btree. 
*/ 5409 + int 5410 + xfs_btree_alloc_metafile_block( 5411 + struct xfs_btree_cur *cur, 5412 + const union xfs_btree_ptr *start, 5413 + union xfs_btree_ptr *new, 5414 + int *stat) 5415 + { 5416 + struct xfs_alloc_arg args = { 5417 + .mp = cur->bc_mp, 5418 + .tp = cur->bc_tp, 5419 + .resv = XFS_AG_RESV_METAFILE, 5420 + .minlen = 1, 5421 + .maxlen = 1, 5422 + .prod = 1, 5423 + }; 5424 + struct xfs_inode *ip = cur->bc_ino.ip; 5425 + int error; 5426 + 5427 + ASSERT(xfs_is_metadir_inode(ip)); 5428 + 5429 + xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork); 5430 + error = xfs_alloc_vextent_start_ag(&args, 5431 + XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino)); 5432 + if (error) 5433 + return error; 5434 + if (args.fsbno == NULLFSBLOCK) { 5435 + *stat = 0; 5436 + return 0; 5437 + } 5438 + ASSERT(args.len == 1); 5439 + 5440 + xfs_metafile_resv_alloc_space(ip, &args); 5441 + 5442 + new->l = cpu_to_be64(args.fsbno); 5443 + *stat = 1; 5444 + return 0; 5445 + } 5446 + 5447 + /* Free a block from an inode-rooted metadata btree. */ 5448 + int 5449 + xfs_btree_free_metafile_block( 5450 + struct xfs_btree_cur *cur, 5451 + struct xfs_buf *bp) 5452 + { 5453 + struct xfs_owner_info oinfo; 5454 + struct xfs_mount *mp = cur->bc_mp; 5455 + struct xfs_inode *ip = cur->bc_ino.ip; 5456 + struct xfs_trans *tp = cur->bc_tp; 5457 + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 5458 + int error; 5459 + 5460 + ASSERT(xfs_is_metadir_inode(ip)); 5461 + 5462 + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); 5463 + error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE, 5464 + 0); 5465 + if (error) 5466 + return error; 5467 + 5468 + xfs_metafile_resv_free_space(ip, tp, 1); 5586 5469 return 0; 5587 5470 }

+25 -3

fs/xfs/libxfs/xfs_btree.h

··· 135 135 /* offset of btree stats array */ 136 136 unsigned int statoff; 137 137 138 - /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */ 138 + /* sick mask for health reporting (not for bmap btrees) */ 139 139 unsigned int sick_mask; 140 140 141 141 /* cursor operations */ ··· 213 213 const union xfs_btree_key *key1, 214 214 const union xfs_btree_key *key2, 215 215 const union xfs_btree_key *mask); 216 + 217 + /* 218 + * Reallocate the space for if_broot to fit the number of records. 219 + * Move the records and pointers in if_broot to fit the new size. When 220 + * shrinking this will eliminate holes between the records and pointers 221 + * created by the caller. When growing this will create holes to be 222 + * filled in by the caller. 223 + * 224 + * The caller must not request to add more records than would fit in 225 + * the on-disk inode root. If the if_broot is currently NULL, then if 226 + * we are adding records, one will be allocated. The caller must also 227 + * not request that the number of records go below zero, although it 228 + * can go to zero. 229 + */ 230 + struct xfs_btree_block *(*broot_realloc)(struct xfs_btree_cur *cur, 231 + unsigned int new_numrecs); 216 232 }; 217 233 218 234 /* btree geometry flags */ 219 235 #define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ 220 - 236 + #define XFS_BTGEO_IROOT_RECORDS (1U << 1) /* iroot can store records */ 221 237 222 238 union xfs_btree_irec { 223 239 struct xfs_alloc_rec_incore a; ··· 297 281 struct { 298 282 unsigned int nr_ops; /* # record updates */ 299 283 unsigned int shape_changes; /* # of extent splits */ 300 - } bc_refc; /* refcountbt */ 284 + } bc_refc; /* refcountbt/rtrefcountbt */ 301 285 }; 302 286 303 287 /* Must be at the end of the struct! 
*/ ··· 702 686 return cur->bc_ops->type == XFS_BTREE_TYPE_INODE && 703 687 level == cur->bc_nlevels - 1; 704 688 } 689 + 690 + int xfs_btree_alloc_metafile_block(struct xfs_btree_cur *cur, 691 + const union xfs_btree_ptr *start, union xfs_btree_ptr *newp, 692 + int *stat); 693 + int xfs_btree_free_metafile_block(struct xfs_btree_cur *cur, 694 + struct xfs_buf *bp); 705 695 706 696 #endif /* __XFS_BTREE_H__ */

+1

fs/xfs/libxfs/xfs_btree_mem.c

··· 18 18 #include "xfs_ag.h" 19 19 #include "xfs_buf_item.h" 20 20 #include "xfs_trace.h" 21 + #include "xfs_rtgroup.h" 21 22 22 23 /* Set the root of an in-memory btree. */ 23 24 void

+7 -3

fs/xfs/libxfs/xfs_btree_staging.c

··· 134 134 cur->bc_ino.ifake = ifake; 135 135 cur->bc_nlevels = ifake->if_levels; 136 136 cur->bc_ino.forksize = ifake->if_fork_size; 137 + cur->bc_ino.whichfork = XFS_STAGING_FORK; 137 138 cur->bc_flags |= XFS_BTREE_STAGING; 138 139 } 139 140 ··· 574 573 struct xfs_btree_bload *bbl, 575 574 uint64_t nr_records) 576 575 { 576 + const struct xfs_btree_ops *ops = cur->bc_ops; 577 577 uint64_t nr_blocks = 0; 578 578 uint64_t nr_this_level; 579 579 ··· 601 599 xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, 602 600 &avg_per_block, &level_blocks, &dontcare64); 603 601 604 - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { 602 + if (ops->type == XFS_BTREE_TYPE_INODE) { 605 603 /* 606 604 * If all the items we want to store at this level 607 605 * would fit in the inode root block, then we have our ··· 609 607 * 610 608 * Note that bmap btrees forbid records in the root. 611 609 */ 612 - if (level != 0 && nr_this_level <= avg_per_block) { 610 + if ((level != 0 || 611 + (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) && 612 + nr_this_level <= avg_per_block) { 613 613 nr_blocks++; 614 614 break; 615 615 } ··· 662 658 return -EOVERFLOW; 663 659 664 660 bbl->btree_height = cur->bc_nlevels; 665 - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) 661 + if (ops->type == XFS_BTREE_TYPE_INODE) 666 662 bbl->nr_blocks = nr_blocks - 1; 667 663 else 668 664 bbl->nr_blocks = nr_blocks;

+2

fs/xfs/libxfs/xfs_defer.h

··· 68 68 69 69 extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; 70 70 extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; 71 + extern const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type; 71 72 extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; 73 + extern const struct xfs_defer_op_type xfs_rtrmap_update_defer_type; 72 74 extern const struct xfs_defer_op_type xfs_extent_free_defer_type; 73 75 extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; 74 76 extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type;

+4 -5

fs/xfs/libxfs/xfs_dir2.c

··· 197 197 /* 198 198 * Return 1 if directory contains only "." and "..". 199 199 */ 200 - int 200 + static bool 201 201 xfs_dir_isempty( 202 202 xfs_inode_t *dp) 203 203 { ··· 205 205 206 206 ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); 207 207 if (dp->i_disk_size == 0) /* might happen during shutdown. */ 208 - return 1; 208 + return true; 209 209 if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) 210 - return 0; 210 + return false; 211 211 sfp = dp->i_df.if_data; 212 212 return !sfp->count; 213 213 } ··· 379 379 !(args->op_flags & XFS_DA_OP_CILOOKUP)) 380 380 return -EEXIST; 381 381 382 - args->value = kmalloc(len, 382 + args->value = kmemdup(name, len, 383 383 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); 384 384 if (!args->value) 385 385 return -ENOMEM; 386 386 387 - memcpy(args->value, name, len); 388 387 args->valuelen = len; 389 388 return -EEXIST; 390 389 }

-1

fs/xfs/libxfs/xfs_dir2.h

··· 58 58 extern int xfs_da_mount(struct xfs_mount *mp); 59 59 extern void xfs_da_unmount(struct xfs_mount *mp); 60 60 61 - extern int xfs_dir_isempty(struct xfs_inode *dp); 62 61 extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, 63 62 struct xfs_inode *pdp); 64 63 extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,

+3 -1

fs/xfs/libxfs/xfs_errortag.h

··· 64 64 #define XFS_ERRTAG_WB_DELAY_MS 42 65 65 #define XFS_ERRTAG_WRITE_DELAY_MS 43 66 66 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 67 - #define XFS_ERRTAG_MAX 45 67 + #define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 68 + #define XFS_ERRTAG_MAX 46 68 69 69 70 /* 70 71 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. ··· 114 113 #define XFS_RANDOM_WB_DELAY_MS 3000 115 114 #define XFS_RANDOM_WRITE_DELAY_MS 3000 116 115 #define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 116 + #define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 117 117 118 118 #endif /* __XFS_ERRORTAG_H_ */

+3 -1

fs/xfs/libxfs/xfs_exchmaps.c

··· 662 662 if (!xfs_has_rmapbt(mp)) 663 663 return 0; 664 664 if (XFS_IS_REALTIME_INODE(req->ip1)) 665 - return 0; 665 + return howmany_64(req->nr_exchanges, 666 + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * 667 + XFS_RTRMAPADD_SPACE_RES(mp); 666 668 667 669 return howmany_64(req->nr_exchanges, 668 670 XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) *

+46 -5

fs/xfs/libxfs/xfs_format.h

··· 857 857 XFS_METAFILE_PRJQUOTA, /* project quota */ 858 858 XFS_METAFILE_RTBITMAP, /* rt bitmap */ 859 859 XFS_METAFILE_RTSUMMARY, /* rt summary */ 860 + XFS_METAFILE_RTRMAP, /* rt rmap */ 861 + XFS_METAFILE_RTREFCOUNT, /* rt refcount */ 860 862 861 863 XFS_METAFILE_MAX 862 864 } __packed; ··· 870 868 { XFS_METAFILE_GRPQUOTA, "grpquota" }, \ 871 869 { XFS_METAFILE_PRJQUOTA, "prjquota" }, \ 872 870 { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \ 873 - { XFS_METAFILE_RTSUMMARY, "rtsummary" } 871 + { XFS_METAFILE_RTSUMMARY, "rtsummary" }, \ 872 + { XFS_METAFILE_RTRMAP, "rtrmap" }, \ 873 + { XFS_METAFILE_RTREFCOUNT, "rtrefcount" } 874 874 875 875 /* 876 876 * On-disk inode structure. ··· 1001 997 XFS_DINODE_FMT_LOCAL, /* bulk data */ 1002 998 XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ 1003 999 XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ 1004 - XFS_DINODE_FMT_UUID /* added long ago, but never used */ 1000 + XFS_DINODE_FMT_UUID, /* added long ago, but never used */ 1001 + XFS_DINODE_FMT_META_BTREE, /* metadata btree */ 1005 1002 }; 1006 1003 1007 1004 #define XFS_INODE_FORMAT_STR \ ··· 1010 1005 { XFS_DINODE_FMT_LOCAL, "local" }, \ 1011 1006 { XFS_DINODE_FMT_EXTENTS, "extent" }, \ 1012 1007 { XFS_DINODE_FMT_BTREE, "btree" }, \ 1013 - { XFS_DINODE_FMT_UUID, "uuid" } 1008 + { XFS_DINODE_FMT_UUID, "uuid" }, \ 1009 + { XFS_DINODE_FMT_META_BTREE, "meta_btree" } 1014 1010 1015 1011 /* 1016 1012 * Max values for extnum and aextnum. ··· 1732 1726 XFS_IBT_BLOCK(mp) + 1) 1733 1727 1734 1728 /* 1729 + * Realtime Reverse mapping btree format definitions 1730 + * 1731 + * This is a btree for reverse mapping records for realtime volumes 1732 + */ 1733 + #define XFS_RTRMAP_CRC_MAGIC 0x4d415052 /* 'MAPR' */ 1734 + 1735 + /* 1736 + * rtrmap root header, on-disk form only. 
1737 + */ 1738 + struct xfs_rtrmap_root { 1739 + __be16 bb_level; /* 0 is a leaf */ 1740 + __be16 bb_numrecs; /* current # of data records */ 1741 + }; 1742 + 1743 + /* inode-based btree pointer type */ 1744 + typedef __be64 xfs_rtrmap_ptr_t; 1745 + 1746 + /* 1735 1747 * Reference Count Btree format definitions 1736 1748 * 1737 1749 */ ··· 1792 1768 __be32 rc_startblock; /* starting block number */ 1793 1769 }; 1794 1770 1795 - #define MAXREFCOUNT ((xfs_nlink_t)~0U) 1796 - #define MAXREFCEXTLEN ((xfs_extlen_t)~0U) 1771 + #define XFS_REFC_REFCOUNT_MAX ((xfs_nlink_t)~0U) 1772 + #define XFS_REFC_LEN_MAX ((xfs_extlen_t)~0U) 1797 1773 1798 1774 /* btree pointer type */ 1799 1775 typedef __be32 xfs_refcount_ptr_t; 1800 1776 1777 + /* 1778 + * Realtime Reference Count btree format definitions 1779 + * 1780 + * This is a btree for reference count records for realtime volumes 1781 + */ 1782 + #define XFS_RTREFC_CRC_MAGIC 0x52434e54 /* 'RCNT' */ 1783 + 1784 + /* 1785 + * rt refcount root header, on-disk form only. 1786 + */ 1787 + struct xfs_rtrefcount_root { 1788 + __be16 bb_level; /* 0 is a leaf */ 1789 + __be16 bb_numrecs; /* current # of data records */ 1790 + }; 1791 + 1792 + /* inode-rooted btree pointer type */ 1793 + typedef __be64 xfs_rtrefcount_ptr_t; 1801 1794 1802 1795 /* 1803 1796 * BMAP Btree format definitions

+8 -2

fs/xfs/libxfs/xfs_fs.h

··· 737 737 #define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */ 738 738 #define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */ 739 739 #define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */ 740 + #define XFS_SCRUB_TYPE_RTRMAPBT 31 /* rtgroup reverse mapping btree */ 741 + #define XFS_SCRUB_TYPE_RTREFCBT 32 /* realtime reference count btree */ 740 742 741 743 /* Number of scrub subcommands. */ 742 - #define XFS_SCRUB_TYPE_NR 31 744 + #define XFS_SCRUB_TYPE_NR 33 743 745 744 746 /* 745 747 * This special type code only applies to the vectored scrub implementation. ··· 831 829 #define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */ 832 830 #define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */ 833 831 #define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */ 832 + #define XFS_SCRUB_METAPATH_RTRMAPBT (8) /* realtime reverse mapping */ 833 + #define XFS_SCRUB_METAPATH_RTREFCOUNTBT (9) /* realtime refcount */ 834 834 835 835 /* Number of metapath sm_ino values */ 836 - #define XFS_SCRUB_METAPATH_NR (8) 836 + #define XFS_SCRUB_METAPATH_NR (10) 837 837 838 838 /* 839 839 * ioctl limits ··· 997 993 #define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */ 998 994 #define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */ 999 995 #define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */ 996 + #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ 997 + #define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ 1000 998 1001 999 /* 1002 1000 * ioctl commands that are used by Linux filesystems

+5 -1

fs/xfs/libxfs/xfs_health.h

··· 70 70 #define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */ 71 71 #define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */ 72 72 #define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */ 73 + #define XFS_SICK_RG_RMAPBT (1 << 3) /* reverse mappings */ 74 + #define XFS_SICK_RG_REFCNTBT (1 << 4) /* reference counts */ 73 75 74 76 /* Observable health issues for AG metadata. */ 75 77 #define XFS_SICK_AG_SB (1 << 0) /* superblock */ ··· 117 115 118 116 #define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \ 119 117 XFS_SICK_RG_BITMAP | \ 120 - XFS_SICK_RG_SUMMARY) 118 + XFS_SICK_RG_SUMMARY | \ 119 + XFS_SICK_RG_RMAPBT | \ 120 + XFS_SICK_RG_REFCNTBT) 121 121 122 122 #define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ 123 123 XFS_SICK_AG_AGF | \

+56 -9

fs/xfs/libxfs/xfs_inode_buf.c

··· 441 441 if (di_nextents > max_extents) 442 442 return __this_address; 443 443 break; 444 + case XFS_DINODE_FMT_META_BTREE: 445 + if (!xfs_has_metadir(mp)) 446 + return __this_address; 447 + if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA))) 448 + return __this_address; 449 + switch (be16_to_cpu(dip->di_metatype)) { 450 + case XFS_METAFILE_RTRMAP: 451 + /* 452 + * growfs must create the rtrmap inodes before adding a 453 + * realtime volume to the filesystem, so we cannot use 454 + * the rtrmapbt predicate here. 455 + */ 456 + if (!xfs_has_rmapbt(mp)) 457 + return __this_address; 458 + break; 459 + case XFS_METAFILE_RTREFCOUNT: 460 + /* same comment about growfs and rmap inodes applies */ 461 + if (!xfs_has_reflink(mp)) 462 + return __this_address; 463 + break; 464 + default: 465 + return __this_address; 466 + } 467 + break; 444 468 default: 445 469 return __this_address; 446 470 } ··· 484 460 if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) 485 461 return __this_address; 486 462 break; 463 + case XFS_DINODE_FMT_META_BTREE: 464 + if (!xfs_has_metadir(mp) || !xfs_has_parent(mp)) 465 + return __this_address; 466 + fallthrough; 487 467 case XFS_DINODE_FMT_LOCAL: /* fall through ... */ 488 468 case XFS_DINODE_FMT_EXTENTS: /* fall through ... 
*/ 489 469 case XFS_DINODE_FMT_BTREE: ··· 665 637 if (mode && nextents + naextents > nblocks) 666 638 return __this_address; 667 639 668 - if (nextents + naextents == 0 && nblocks != 0) 669 - return __this_address; 670 - 671 640 if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) 672 641 return __this_address; 673 642 ··· 748 723 return __this_address; 749 724 750 725 /* don't let reflink and realtime mix */ 751 - if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) 726 + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) && 727 + !xfs_has_rtreflink(mp)) 752 728 return __this_address; 753 729 754 730 /* COW extent size hint validation */ ··· 767 741 fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2); 768 742 if (fa) 769 743 return fa; 744 + } 745 + 746 + /* metadata inodes containing btrees always have zero extent count */ 747 + if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) { 748 + if (nextents + naextents == 0 && nblocks != 0) 749 + return __this_address; 770 750 } 771 751 772 752 return NULL; ··· 910 878 bool rt_flag; 911 879 bool hint_flag; 912 880 uint32_t cowextsize_bytes; 881 + uint32_t blocksize_bytes; 913 882 914 883 rt_flag = (flags & XFS_DIFLAG_REALTIME); 915 884 hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); 916 885 cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); 886 + 887 + /* 888 + * Similar to extent size hints, a directory can be configured to 889 + * propagate realtime status and a CoW extent size hint to newly 890 + * created files even if there is no realtime device, and the hints on 891 + * disk can become misaligned if the sysadmin changes the rt extent 892 + * size while adding the realtime device. 893 + * 894 + * Therefore, we can only enforce the rextsize alignment check against 895 + * regular realtime files, and rely on callers to decide when alignment 896 + * checks are appropriate, and fix things up as needed. 
897 + */ 898 + 899 + if (rt_flag) 900 + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); 901 + else 902 + blocksize_bytes = mp->m_sb.sb_blocksize; 917 903 918 904 if (hint_flag && !xfs_has_reflink(mp)) 919 905 return __this_address; ··· 946 896 if (mode && !hint_flag && cowextsize != 0) 947 897 return __this_address; 948 898 949 - if (hint_flag && rt_flag) 950 - return __this_address; 951 - 952 - if (cowextsize_bytes % mp->m_sb.sb_blocksize) 899 + if (cowextsize_bytes % blocksize_bytes) 953 900 return __this_address; 954 901 955 902 if (cowextsize > XFS_MAX_BMBT_EXTLEN) 956 903 return __this_address; 957 904 958 - if (cowextsize > mp->m_sb.sb_agblocks / 2) 905 + if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2) 959 906 return __this_address; 960 907 961 908 return NULL;

+90 -129

fs/xfs/libxfs/xfs_inode_fork.c

··· 27 27 #include "xfs_errortag.h" 28 28 #include "xfs_health.h" 29 29 #include "xfs_symlink_remote.h" 30 + #include "xfs_rtrmap_btree.h" 31 + #include "xfs_rtrefcount_btree.h" 30 32 31 33 struct kmem_cache *xfs_ifork_cache; 32 34 ··· 180 178 struct xfs_mount *mp = ip->i_mount; 181 179 xfs_bmdr_block_t *dfp; 182 180 struct xfs_ifork *ifp; 183 - /* REFERENCED */ 181 + struct xfs_btree_block *broot; 184 182 int nrecs; 185 183 int size; 186 184 int level; ··· 213 211 return -EFSCORRUPTED; 214 212 } 215 213 216 - ifp->if_broot_bytes = size; 217 - ifp->if_broot = kmalloc(size, 218 - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 219 - ASSERT(ifp->if_broot != NULL); 214 + broot = xfs_broot_alloc(ifp, size); 220 215 /* 221 216 * Copy and convert from the on-disk structure 222 217 * to the in-memory structure. 223 218 */ 224 219 xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), 225 - ifp->if_broot, size); 220 + broot, size); 226 221 227 222 ifp->if_bytes = 0; 228 223 ifp->if_data = NULL; ··· 269 270 return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); 270 271 case XFS_DINODE_FMT_BTREE: 271 272 return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); 273 + case XFS_DINODE_FMT_META_BTREE: 274 + switch (ip->i_metatype) { 275 + case XFS_METAFILE_RTRMAP: 276 + return xfs_iformat_rtrmap(ip, dip); 277 + case XFS_METAFILE_RTREFCOUNT: 278 + return xfs_iformat_rtrefcount(ip, dip); 279 + default: 280 + break; 281 + } 282 + fallthrough; 272 283 default: 273 284 xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, 274 285 dip, sizeof(*dip), __this_address); ··· 372 363 } 373 364 374 365 /* 375 - * Reallocate the space for if_broot based on the number of records 376 - * being added or deleted as indicated in rec_diff. Move the records 377 - * and pointers in if_broot to fit the new size. When shrinking this 378 - * will eliminate holes between the records and pointers created by 379 - * the caller. When growing this will create holes to be filled in 380 - * by the caller. 
381 - * 382 - * The caller must not request to add more records than would fit in 383 - * the on-disk inode root. If the if_broot is currently NULL, then 384 - * if we are adding records, one will be allocated. The caller must also 385 - * not request that the number of records go below zero, although 386 - * it can go to zero. 387 - * 388 - * ip -- the inode whose if_broot area is changing 389 - * ext_diff -- the change in the number of records, positive or negative, 390 - * requested for the if_broot array. 366 + * Allocate the if_broot component of an inode fork so that it is @new_size 367 + * bytes in size, using __GFP_NOLOCKDEP like all the other code that 368 + * initializes a broot during inode load. Returns if_broot. 391 369 */ 392 - void 393 - xfs_iroot_realloc( 394 - xfs_inode_t *ip, 395 - int rec_diff, 396 - int whichfork) 370 + struct xfs_btree_block * 371 + xfs_broot_alloc( 372 + struct xfs_ifork *ifp, 373 + size_t new_size) 397 374 { 398 - struct xfs_mount *mp = ip->i_mount; 399 - int cur_max; 400 - struct xfs_ifork *ifp; 401 - struct xfs_btree_block *new_broot; 402 - int new_max; 403 - size_t new_size; 404 - char *np; 405 - char *op; 375 + ASSERT(ifp->if_broot == NULL); 406 376 407 - /* 408 - * Handle the degenerate case quietly. 409 - */ 410 - if (rec_diff == 0) { 411 - return; 412 - } 413 - 414 - ifp = xfs_ifork_ptr(ip, whichfork); 415 - if (rec_diff > 0) { 416 - /* 417 - * If there wasn't any memory allocated before, just 418 - * allocate it now and get out. 419 - */ 420 - if (ifp->if_broot_bytes == 0) { 421 - new_size = xfs_bmap_broot_space_calc(mp, rec_diff); 422 - ifp->if_broot = kmalloc(new_size, 423 - GFP_KERNEL | __GFP_NOFAIL); 424 - ifp->if_broot_bytes = (int)new_size; 425 - return; 426 - } 427 - 428 - /* 429 - * If there is already an existing if_broot, then we need 430 - * to realloc() it and shift the pointers to their new 431 - * location. 
The records don't change location because 432 - * they are kept butted up against the btree block header. 433 - */ 434 - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false); 435 - new_max = cur_max + rec_diff; 436 - new_size = xfs_bmap_broot_space_calc(mp, new_max); 437 - ifp->if_broot = krealloc(ifp->if_broot, new_size, 438 - GFP_KERNEL | __GFP_NOFAIL); 439 - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, 440 - ifp->if_broot_bytes); 441 - np = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, 442 - (int)new_size); 443 - ifp->if_broot_bytes = (int)new_size; 444 - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= 445 - xfs_inode_fork_size(ip, whichfork)); 446 - memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); 447 - return; 448 - } 449 - 450 - /* 451 - * rec_diff is less than 0. In this case, we are shrinking the 452 - * if_broot buffer. It must already exist. If we go to zero 453 - * records, just get rid of the root and clear the status bit. 454 - */ 455 - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 456 - cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, false); 457 - new_max = cur_max + rec_diff; 458 - ASSERT(new_max >= 0); 459 - if (new_max > 0) 460 - new_size = xfs_bmap_broot_space_calc(mp, new_max); 461 - else 462 - new_size = 0; 463 - if (new_size > 0) { 464 - new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); 465 - /* 466 - * First copy over the btree block header. 467 - */ 468 - memcpy(new_broot, ifp->if_broot, 469 - xfs_bmbt_block_len(ip->i_mount)); 470 - } else { 471 - new_broot = NULL; 472 - } 473 - 474 - /* 475 - * Only copy the keys and pointers if there are any. 476 - */ 477 - if (new_max > 0) { 478 - /* 479 - * First copy the keys. 480 - */ 481 - op = (char *)xfs_bmbt_key_addr(mp, ifp->if_broot, 1); 482 - np = (char *)xfs_bmbt_key_addr(mp, new_broot, 1); 483 - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t)); 484 - 485 - /* 486 - * Then copy the pointers. 
487 - */ 488 - op = (char *)xfs_bmap_broot_ptr_addr(mp, ifp->if_broot, 1, 489 - ifp->if_broot_bytes); 490 - np = (char *)xfs_bmap_broot_ptr_addr(mp, new_broot, 1, 491 - (int)new_size); 492 - memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); 493 - } 494 - kfree(ifp->if_broot); 495 - ifp->if_broot = new_broot; 496 - ifp->if_broot_bytes = (int)new_size; 497 - if (ifp->if_broot) 498 - ASSERT(xfs_bmap_bmdr_space(ifp->if_broot) <= 499 - xfs_inode_fork_size(ip, whichfork)); 500 - return; 377 + ifp->if_broot = kmalloc(new_size, 378 + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 379 + ifp->if_broot_bytes = new_size; 380 + return ifp->if_broot; 501 381 } 502 382 383 + /* 384 + * Reallocate the if_broot component of an inode fork so that it is @new_size 385 + * bytes in size. Returns if_broot. 386 + */ 387 + struct xfs_btree_block * 388 + xfs_broot_realloc( 389 + struct xfs_ifork *ifp, 390 + size_t new_size) 391 + { 392 + /* No size change? No action needed. */ 393 + if (new_size == ifp->if_broot_bytes) 394 + return ifp->if_broot; 395 + 396 + /* New size is zero, free it. */ 397 + if (new_size == 0) { 398 + ifp->if_broot_bytes = 0; 399 + kfree(ifp->if_broot); 400 + ifp->if_broot = NULL; 401 + return NULL; 402 + } 403 + 404 + /* 405 + * Shrinking the iroot means we allocate a new smaller object and copy 406 + * it. We don't trust krealloc not to nop on realloc-down. 407 + */ 408 + if (ifp->if_broot_bytes > 0 && ifp->if_broot_bytes > new_size) { 409 + struct xfs_btree_block *old_broot = ifp->if_broot; 410 + 411 + ifp->if_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); 412 + ifp->if_broot_bytes = new_size; 413 + memcpy(ifp->if_broot, old_broot, new_size); 414 + kfree(old_broot); 415 + return ifp->if_broot; 416 + } 417 + 418 + /* 419 + * Growing the iroot means we can krealloc. This may get us the same 420 + * object. 
421 + */ 422 + ifp->if_broot = krealloc(ifp->if_broot, new_size, 423 + GFP_KERNEL | __GFP_NOFAIL); 424 + ifp->if_broot_bytes = new_size; 425 + return ifp->if_broot; 426 + } 503 427 504 428 /* 505 429 * This is called when the amount of space needed for if_data ··· 610 668 ASSERT(whichfork == XFS_DATA_FORK); 611 669 xfs_dinode_put_rdev(dip, 612 670 linux_to_xfs_dev_t(VFS_I(ip)->i_rdev)); 671 + } 672 + break; 673 + 674 + case XFS_DINODE_FMT_META_BTREE: 675 + ASSERT(whichfork == XFS_DATA_FORK); 676 + 677 + if (!(iip->ili_fields & brootflag[whichfork])) 678 + break; 679 + 680 + switch (ip->i_metatype) { 681 + case XFS_METAFILE_RTRMAP: 682 + xfs_iflush_rtrmap(ip, dip); 683 + break; 684 + case XFS_METAFILE_RTREFCOUNT: 685 + xfs_iflush_rtrefcount(ip, dip); 686 + break; 687 + default: 688 + ASSERT(0); 689 + break; 613 690 } 614 691 break; 615 692

+5 -1

fs/xfs/libxfs/xfs_inode_fork.h

··· 170 170 void xfs_idestroy_fork(struct xfs_ifork *ifp); 171 171 void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, 172 172 int whichfork); 173 - void xfs_iroot_realloc(struct xfs_inode *, int, int); 173 + struct xfs_btree_block *xfs_broot_alloc(struct xfs_ifork *ifp, 174 + size_t new_size); 175 + struct xfs_btree_block *xfs_broot_realloc(struct xfs_ifork *ifp, 176 + size_t new_size); 177 + 174 178 int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); 175 179 int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, 176 180 int);

+9 -7

fs/xfs/libxfs/xfs_log_format.h

··· 250 250 #define XFS_LI_XMD 0x1249 /* mapping exchange done */ 251 251 #define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */ 252 252 #define XFS_LI_EFD_RT 0x124b /* realtime extent free done */ 253 + #define XFS_LI_RUI_RT 0x124c /* realtime rmap update intent */ 254 + #define XFS_LI_RUD_RT 0x124d /* realtime rmap update done */ 255 + #define XFS_LI_CUI_RT 0x124e /* realtime refcount update intent */ 256 + #define XFS_LI_CUD_RT 0x124f /* realtime refcount update done */ 253 257 254 258 #define XFS_LI_TYPE_DESC \ 255 259 { XFS_LI_EFI, "XFS_LI_EFI" }, \ ··· 275 271 { XFS_LI_XMI, "XFS_LI_XMI" }, \ 276 272 { XFS_LI_XMD, "XFS_LI_XMD" }, \ 277 273 { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \ 278 - { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" } 274 + { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }, \ 275 + { XFS_LI_RUI_RT, "XFS_LI_RUI_RT" }, \ 276 + { XFS_LI_RUD_RT, "XFS_LI_RUD_RT" }, \ 277 + { XFS_LI_CUI_RT, "XFS_LI_CUI_RT" }, \ 278 + { XFS_LI_CUD_RT, "XFS_LI_CUD_RT" } 279 279 280 280 /* 281 281 * Inode Log Item Format definitions. ··· 358 350 * in the inode item correctly. 359 351 */ 360 352 #define XFS_ILOG_IVERSION 0x8000 361 - 362 - #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ 363 - XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ 364 - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ 365 - XFS_ILOG_ABROOT | XFS_ILOG_DOWNER | \ 366 - XFS_ILOG_AOWNER) 367 353 368 354 #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ 369 355 XFS_ILOG_DBROOT)

+4

fs/xfs/libxfs/xfs_log_recover.h

··· 79 79 extern const struct xlog_recover_item_ops xlog_xmd_item_ops; 80 80 extern const struct xlog_recover_item_ops xlog_rtefi_item_ops; 81 81 extern const struct xlog_recover_item_ops xlog_rtefd_item_ops; 82 + extern const struct xlog_recover_item_ops xlog_rtrui_item_ops; 83 + extern const struct xlog_recover_item_ops xlog_rtrud_item_ops; 84 + extern const struct xlog_recover_item_ops xlog_rtcui_item_ops; 85 + extern const struct xlog_recover_item_ops xlog_rtcud_item_ops; 82 86 83 87 /* 84 88 * Macros, structures, prototypes for internal log manager use.

+4

fs/xfs/libxfs/xfs_metadir.c

··· 29 29 #include "xfs_dir2_priv.h" 30 30 #include "xfs_parent.h" 31 31 #include "xfs_health.h" 32 + #include "xfs_errortag.h" 33 + #include "xfs_error.h" 34 + #include "xfs_btree.h" 35 + #include "xfs_alloc.h" 32 36 33 37 /* 34 38 * Metadata Directory Tree

+223

fs/xfs/libxfs/xfs_metafile.c

··· 17 17 #include "xfs_metafile.h" 18 18 #include "xfs_trace.h" 19 19 #include "xfs_inode.h" 20 + #include "xfs_quota.h" 21 + #include "xfs_errortag.h" 22 + #include "xfs_error.h" 23 + #include "xfs_alloc.h" 24 + 25 + static const struct { 26 + enum xfs_metafile_type mtype; 27 + const char *name; 28 + } xfs_metafile_type_strs[] = { XFS_METAFILE_TYPE_STR }; 29 + 30 + const char * 31 + xfs_metafile_type_str(enum xfs_metafile_type metatype) 32 + { 33 + unsigned int i; 34 + 35 + for (i = 0; i < ARRAY_SIZE(xfs_metafile_type_strs); i++) { 36 + if (xfs_metafile_type_strs[i].mtype == metatype) 37 + return xfs_metafile_type_strs[i].name; 38 + } 39 + 40 + return NULL; 41 + } 20 42 21 43 /* Set up an inode to be recognized as a metadata directory inode. */ 22 44 void ··· 71 49 72 50 ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA; 73 51 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 52 + } 53 + 54 + /* 55 + * Is the amount of space that could be allocated towards a given metadata 56 + * file at or beneath a certain threshold? 57 + */ 58 + static inline bool 59 + xfs_metafile_resv_can_cover( 60 + struct xfs_inode *ip, 61 + int64_t rhs) 62 + { 63 + /* 64 + * The amount of space that can be allocated to this metadata file is 65 + * the remaining reservation for the particular metadata file + the 66 + * global free block count. Take care of the first case to avoid 67 + * touching the per-cpu counter. 68 + */ 69 + if (ip->i_delayed_blks >= rhs) 70 + return true; 71 + 72 + /* 73 + * There aren't enough blocks left in the inode's reservation, but it 74 + * isn't critical unless there also isn't enough free space. 75 + */ 76 + return __percpu_counter_compare(&ip->i_mount->m_fdblocks, 77 + rhs - ip->i_delayed_blks, 2048) >= 0; 78 + } 79 + 80 + /* 81 + * Is this metadata file critically low on blocks? For now we'll define that 82 + * as the number of blocks we can get our hands on being less than 10% of what 83 + * we reserved or less than some arbitrary number (maximum btree height). 
84 + */ 85 + bool 86 + xfs_metafile_resv_critical( 87 + struct xfs_inode *ip) 88 + { 89 + uint64_t asked_low_water; 90 + 91 + if (!ip) 92 + return false; 93 + 94 + ASSERT(xfs_is_metadir_inode(ip)); 95 + trace_xfs_metafile_resv_critical(ip, 0); 96 + 97 + if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) 98 + return true; 99 + 100 + asked_low_water = div_u64(ip->i_meta_resv_asked, 10); 101 + if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) 102 + return true; 103 + 104 + return XFS_TEST_ERROR(false, ip->i_mount, 105 + XFS_ERRTAG_METAFILE_RESV_CRITICAL); 106 + } 107 + 108 + /* Allocate a block from the metadata file's reservation. */ 109 + void 110 + xfs_metafile_resv_alloc_space( 111 + struct xfs_inode *ip, 112 + struct xfs_alloc_arg *args) 113 + { 114 + int64_t len = args->len; 115 + 116 + ASSERT(xfs_is_metadir_inode(ip)); 117 + ASSERT(args->resv == XFS_AG_RESV_METAFILE); 118 + 119 + trace_xfs_metafile_resv_alloc_space(ip, args->len); 120 + 121 + /* 122 + * Allocate the blocks from the metadata inode's block reservation 123 + * and update the ondisk sb counter. 124 + */ 125 + if (ip->i_delayed_blks > 0) { 126 + int64_t from_resv; 127 + 128 + from_resv = min_t(int64_t, len, ip->i_delayed_blks); 129 + ip->i_delayed_blks -= from_resv; 130 + xfs_mod_delalloc(ip, 0, -from_resv); 131 + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, 132 + -from_resv); 133 + len -= from_resv; 134 + } 135 + 136 + /* 137 + * Any allocation in excess of the reservation requires in-core and 138 + * on-disk fdblocks updates. If we can grab @len blocks from the 139 + * in-core fdblocks then all we need to do is update the on-disk 140 + * superblock; if not, then try to steal some from the transaction's 141 + * block reservation. Overruns are only expected for rmap btrees. 
142 + */ 143 + if (len) { 144 + unsigned int field; 145 + int error; 146 + 147 + error = xfs_dec_fdblocks(ip->i_mount, len, true); 148 + if (error) 149 + field = XFS_TRANS_SB_FDBLOCKS; 150 + else 151 + field = XFS_TRANS_SB_RES_FDBLOCKS; 152 + 153 + xfs_trans_mod_sb(args->tp, field, -len); 154 + } 155 + 156 + ip->i_nblocks += args->len; 157 + xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); 158 + } 159 + 160 + /* Free a block to the metadata file's reservation. */ 161 + void 162 + xfs_metafile_resv_free_space( 163 + struct xfs_inode *ip, 164 + struct xfs_trans *tp, 165 + xfs_filblks_t len) 166 + { 167 + int64_t to_resv; 168 + 169 + ASSERT(xfs_is_metadir_inode(ip)); 170 + trace_xfs_metafile_resv_free_space(ip, len); 171 + 172 + ip->i_nblocks -= len; 173 + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 174 + 175 + /* 176 + * Add the freed blocks back into the inode's delalloc reservation 177 + * until it reaches the maximum size. Update the ondisk fdblocks only. 178 + */ 179 + to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); 180 + if (to_resv > 0) { 181 + to_resv = min_t(int64_t, to_resv, len); 182 + ip->i_delayed_blks += to_resv; 183 + xfs_mod_delalloc(ip, 0, to_resv); 184 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); 185 + len -= to_resv; 186 + } 187 + 188 + /* 189 + * Everything else goes back to the filesystem, so update the in-core 190 + * and on-disk counters. 191 + */ 192 + if (len) 193 + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); 194 + } 195 + 196 + /* Release a metadata file's space reservation. */ 197 + void 198 + xfs_metafile_resv_free( 199 + struct xfs_inode *ip) 200 + { 201 + /* Non-btree metadata inodes don't need space reservations. 
*/ 202 + if (!ip || !ip->i_meta_resv_asked) 203 + return; 204 + 205 + ASSERT(xfs_is_metadir_inode(ip)); 206 + trace_xfs_metafile_resv_free(ip, 0); 207 + 208 + if (ip->i_delayed_blks) { 209 + xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); 210 + xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); 211 + ip->i_delayed_blks = 0; 212 + } 213 + ip->i_meta_resv_asked = 0; 214 + } 215 + 216 + /* Set up a metadata file's space reservation. */ 217 + int 218 + xfs_metafile_resv_init( 219 + struct xfs_inode *ip, 220 + xfs_filblks_t ask) 221 + { 222 + xfs_filblks_t hidden_space; 223 + xfs_filblks_t used; 224 + int error; 225 + 226 + if (!ip || ip->i_meta_resv_asked > 0) 227 + return 0; 228 + 229 + ASSERT(xfs_is_metadir_inode(ip)); 230 + 231 + /* 232 + * Space taken by all other metadata btrees are accounted on-disk as 233 + * used space. We therefore only hide the space that is reserved but 234 + * not used by the trees. 235 + */ 236 + used = ip->i_nblocks; 237 + if (used > ask) 238 + ask = used; 239 + hidden_space = ask - used; 240 + 241 + error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); 242 + if (error) { 243 + trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); 244 + return error; 245 + } 246 + 247 + xfs_mod_delalloc(ip, 0, hidden_space); 248 + ip->i_delayed_blks = hidden_space; 249 + ip->i_meta_resv_asked = ask; 250 + 251 + trace_xfs_metafile_resv_init(ip, ask); 252 + return 0; 74 253 }

+13

fs/xfs/libxfs/xfs_metafile.h

··· 6 6 #ifndef __XFS_METAFILE_H__ 7 7 #define __XFS_METAFILE_H__ 8 8 9 + const char *xfs_metafile_type_str(enum xfs_metafile_type metatype); 10 + 9 11 /* All metadata files must have these flags set. */ 10 12 #define XFS_METAFILE_DIFLAGS (XFS_DIFLAG_IMMUTABLE | \ 11 13 XFS_DIFLAG_SYNC | \ ··· 22 20 void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip, 23 21 enum xfs_metafile_type metafile_type); 24 22 void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); 23 + 24 + /* Space reservations for metadata inodes. */ 25 + struct xfs_alloc_arg; 26 + 27 + bool xfs_metafile_resv_critical(struct xfs_inode *ip); 28 + void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, 29 + struct xfs_alloc_arg *args); 30 + void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, 31 + xfs_filblks_t len); 32 + void xfs_metafile_resv_free(struct xfs_inode *ip); 33 + int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); 25 34 26 35 /* Code specific to kernel/userspace; must be provided externally. */ 27 36

+4

fs/xfs/libxfs/xfs_ondisk.h

··· 83 83 XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); 84 84 XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); 85 85 XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48); 86 + XFS_CHECK_STRUCT_SIZE(xfs_rtrmap_ptr_t, 8); 87 + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrmap_root, 4); 88 + XFS_CHECK_STRUCT_SIZE(xfs_rtrefcount_ptr_t, 8); 89 + XFS_CHECK_STRUCT_SIZE(struct xfs_rtrefcount_root, 4); 86 90 87 91 /* 88 92 * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad

+228 -50

fs/xfs/libxfs/xfs_refcount.c

··· 25 25 #include "xfs_ag.h" 26 26 #include "xfs_health.h" 27 27 #include "xfs_refcount_item.h" 28 + #include "xfs_rtgroup.h" 29 + #include "xfs_rtalloc.h" 30 + #include "xfs_rtrefcount_btree.h" 28 31 29 32 struct kmem_cache *xfs_refcount_intent_cache; 30 33 ··· 131 128 struct xfs_perag *pag, 132 129 const struct xfs_refcount_irec *irec) 133 130 { 134 - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) 131 + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) 135 132 return __this_address; 136 133 137 134 if (!xfs_refcount_check_domain(irec)) ··· 141 138 if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) 142 139 return __this_address; 143 140 144 - if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) 141 + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) 145 142 return __this_address; 146 143 147 144 return NULL; 145 + } 146 + 147 + xfs_failaddr_t 148 + xfs_rtrefcount_check_irec( 149 + struct xfs_rtgroup *rtg, 150 + const struct xfs_refcount_irec *irec) 151 + { 152 + if (irec->rc_blockcount == 0 || irec->rc_blockcount > XFS_REFC_LEN_MAX) 153 + return __this_address; 154 + 155 + if (!xfs_refcount_check_domain(irec)) 156 + return __this_address; 157 + 158 + /* check for valid extent range, including overflow */ 159 + if (!xfs_verify_rgbext(rtg, irec->rc_startblock, irec->rc_blockcount)) 160 + return __this_address; 161 + 162 + if (irec->rc_refcount == 0 || irec->rc_refcount > XFS_REFC_REFCOUNT_MAX) 163 + return __this_address; 164 + 165 + return NULL; 166 + } 167 + 168 + static inline xfs_failaddr_t 169 + xfs_refcount_check_btrec( 170 + struct xfs_btree_cur *cur, 171 + const struct xfs_refcount_irec *irec) 172 + { 173 + if (xfs_btree_is_rtrefcount(cur->bc_ops)) 174 + return xfs_rtrefcount_check_irec(to_rtg(cur->bc_group), irec); 175 + return xfs_refcount_check_irec(to_perag(cur->bc_group), irec); 148 176 } 149 177 150 178 static inline int ··· 186 152 { 187 153 struct 
xfs_mount *mp = cur->bc_mp; 188 154 189 - xfs_warn(mp, 155 + if (xfs_btree_is_rtrefcount(cur->bc_ops)) { 156 + xfs_warn(mp, 157 + "RT Refcount BTree record corruption in rtgroup %u detected at %pS!", 158 + cur->bc_group->xg_gno, fa); 159 + } else { 160 + xfs_warn(mp, 190 161 "Refcount BTree record corruption in AG %d detected at %pS!", 191 162 cur->bc_group->xg_gno, fa); 163 + } 192 164 xfs_warn(mp, 193 165 "Start block 0x%x, block count 0x%x, references 0x%x", 194 166 irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); ··· 220 180 return error; 221 181 222 182 xfs_refcount_btrec_to_irec(rec, irec); 223 - fa = xfs_refcount_check_irec(to_perag(cur->bc_group), irec); 183 + fa = xfs_refcount_check_btrec(cur, irec); 224 184 if (fa) 225 185 return xfs_refcount_complain_bad_rec(cur, fa, irec); 226 186 ··· 893 853 const struct xfs_refcount_irec *irec, 894 854 enum xfs_refc_adjust_op adjust) 895 855 { 896 - /* Once a record hits MAXREFCOUNT, it is pinned there forever */ 897 - if (irec->rc_refcount == MAXREFCOUNT) 898 - return MAXREFCOUNT; 856 + /* Once a record hits XFS_REFC_REFCOUNT_MAX, it is pinned forever */ 857 + if (irec->rc_refcount == XFS_REFC_REFCOUNT_MAX) 858 + return XFS_REFC_REFCOUNT_MAX; 899 859 return irec->rc_refcount + adjust; 900 860 } 901 861 ··· 938 898 * hence we need to catch u32 addition overflows here. 939 899 */ 940 900 ulen += cleft->rc_blockcount + right->rc_blockcount; 941 - if (ulen >= MAXREFCEXTLEN) 901 + if (ulen >= XFS_REFC_LEN_MAX) 942 902 return false; 943 903 944 904 *ulenp = ulen; ··· 973 933 * hence we need to catch u32 addition overflows here. 974 934 */ 975 935 ulen += cleft->rc_blockcount; 976 - if (ulen >= MAXREFCEXTLEN) 936 + if (ulen >= XFS_REFC_LEN_MAX) 977 937 return false; 978 938 979 939 return true; ··· 1007 967 * hence we need to catch u32 addition overflows here. 
1008 968 */ 1009 969 ulen += cright->rc_blockcount; 1010 - if (ulen >= MAXREFCEXTLEN) 970 + if (ulen >= XFS_REFC_LEN_MAX) 1011 971 return false; 1012 972 1013 973 return true; ··· 1105 1065 */ 1106 1066 overhead = xfs_allocfree_block_count(cur->bc_mp, 1107 1067 cur->bc_refc.shape_changes); 1108 - overhead += cur->bc_mp->m_refc_maxlevels; 1068 + overhead += cur->bc_maxlevels; 1109 1069 overhead *= cur->bc_mp->m_sb.sb_blocksize; 1110 1070 1111 1071 /* ··· 1125 1085 cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; 1126 1086 } 1127 1087 1088 + /* Schedule an extent free. */ 1089 + static int 1090 + xrefc_free_extent( 1091 + struct xfs_btree_cur *cur, 1092 + struct xfs_refcount_irec *rec) 1093 + { 1094 + unsigned int flags = 0; 1095 + 1096 + if (xfs_btree_is_rtrefcount(cur->bc_ops)) 1097 + flags |= XFS_FREE_EXTENT_REALTIME; 1098 + 1099 + return xfs_free_extent_later(cur->bc_tp, 1100 + xfs_gbno_to_fsb(cur->bc_group, rec->rc_startblock), 1101 + rec->rc_blockcount, NULL, XFS_AG_RESV_NONE, flags); 1102 + } 1103 + 1128 1104 /* 1129 1105 * Adjust the refcounts of middle extents. At this point we should have 1130 1106 * split extents that crossed the adjustment range; merged with adjacent ··· 1157 1101 struct xfs_refcount_irec ext, tmp; 1158 1102 int error; 1159 1103 int found_rec, found_tmp; 1160 - xfs_fsblock_t fsbno; 1161 1104 1162 1105 /* Merging did all the work already. 
*/ 1163 1106 if (*aglen == 0) ··· 1172 1117 if (error) 1173 1118 goto out_error; 1174 1119 if (!found_rec || ext.rc_domain != XFS_REFC_DOMAIN_SHARED) { 1175 - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; 1120 + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); 1176 1121 ext.rc_blockcount = 0; 1177 1122 ext.rc_refcount = 0; 1178 1123 ext.rc_domain = XFS_REFC_DOMAIN_SHARED; ··· 1209 1154 goto out_error; 1210 1155 } 1211 1156 } else { 1212 - fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), 1213 - tmp.rc_startblock); 1214 - error = xfs_free_extent_later(cur->bc_tp, fsbno, 1215 - tmp.rc_blockcount, NULL, 1216 - XFS_AG_RESV_NONE, 0); 1157 + error = xrefc_free_extent(cur, &tmp); 1217 1158 if (error) 1218 1159 goto out_error; 1219 1160 } ··· 1247 1196 * Adjust the reference count and either update the tree 1248 1197 * (incr) or free the blocks (decr). 1249 1198 */ 1250 - if (ext.rc_refcount == MAXREFCOUNT) 1199 + if (ext.rc_refcount == XFS_REFC_REFCOUNT_MAX) 1251 1200 goto skip; 1252 1201 ext.rc_refcount += adj; 1253 1202 trace_xfs_refcount_modify_extent(cur, &ext); ··· 1267 1216 } 1268 1217 goto advloop; 1269 1218 } else { 1270 - fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), 1271 - ext.rc_startblock); 1272 - error = xfs_free_extent_later(cur->bc_tp, fsbno, 1273 - ext.rc_blockcount, NULL, 1274 - XFS_AG_RESV_NONE, 0); 1219 + error = xrefc_free_extent(cur, &ext); 1275 1220 if (error) 1276 1221 goto out_error; 1277 1222 } ··· 1464 1417 } 1465 1418 1466 1419 /* 1420 + * Set up a continuation of a deferred rtrefcount operation by updating the 1421 + * intent. Checks to make sure we're not going to run off the end of the 1422 + * rtgroup. 
1423 + */ 1424 + static inline int 1425 + xfs_rtrefcount_continue_op( 1426 + struct xfs_btree_cur *cur, 1427 + struct xfs_refcount_intent *ri, 1428 + xfs_agblock_t new_agbno) 1429 + { 1430 + struct xfs_mount *mp = cur->bc_mp; 1431 + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); 1432 + 1433 + if (XFS_IS_CORRUPT(mp, !xfs_verify_rgbext(rtg, new_agbno, 1434 + ri->ri_blockcount))) { 1435 + xfs_btree_mark_sick(cur); 1436 + return -EFSCORRUPTED; 1437 + } 1438 + 1439 + ri->ri_startblock = xfs_rgbno_to_rtb(rtg, new_agbno); 1440 + 1441 + ASSERT(xfs_verify_rtbext(mp, ri->ri_startblock, ri->ri_blockcount)); 1442 + return 0; 1443 + } 1444 + 1445 + /* 1446 + * Process one of the deferred realtime refcount operations. We pass back the 1447 + * btree cursor to maintain our lock on the btree between calls. 1448 + */ 1449 + int 1450 + xfs_rtrefcount_finish_one( 1451 + struct xfs_trans *tp, 1452 + struct xfs_refcount_intent *ri, 1453 + struct xfs_btree_cur **pcur) 1454 + { 1455 + struct xfs_mount *mp = tp->t_mountp; 1456 + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); 1457 + struct xfs_btree_cur *rcur = *pcur; 1458 + int error = 0; 1459 + xfs_rgblock_t bno; 1460 + unsigned long nr_ops = 0; 1461 + int shape_changes = 0; 1462 + 1463 + bno = xfs_rtb_to_rgbno(mp, ri->ri_startblock); 1464 + 1465 + trace_xfs_refcount_deferred(mp, ri); 1466 + 1467 + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) 1468 + return -EIO; 1469 + 1470 + /* 1471 + * If we haven't gotten a cursor or the cursor AG doesn't match 1472 + * the startblock, get one now. 
1473 + */ 1474 + if (rcur != NULL && rcur->bc_group != ri->ri_group) { 1475 + nr_ops = rcur->bc_refc.nr_ops; 1476 + shape_changes = rcur->bc_refc.shape_changes; 1477 + xfs_btree_del_cursor(rcur, 0); 1478 + rcur = NULL; 1479 + *pcur = NULL; 1480 + } 1481 + if (rcur == NULL) { 1482 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); 1483 + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_REFCOUNT); 1484 + *pcur = rcur = xfs_rtrefcountbt_init_cursor(tp, rtg); 1485 + 1486 + rcur->bc_refc.nr_ops = nr_ops; 1487 + rcur->bc_refc.shape_changes = shape_changes; 1488 + } 1489 + 1490 + switch (ri->ri_type) { 1491 + case XFS_REFCOUNT_INCREASE: 1492 + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, 1493 + XFS_REFCOUNT_ADJUST_INCREASE); 1494 + if (error) 1495 + return error; 1496 + if (ri->ri_blockcount > 0) 1497 + error = xfs_rtrefcount_continue_op(rcur, ri, bno); 1498 + break; 1499 + case XFS_REFCOUNT_DECREASE: 1500 + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, 1501 + XFS_REFCOUNT_ADJUST_DECREASE); 1502 + if (error) 1503 + return error; 1504 + if (ri->ri_blockcount > 0) 1505 + error = xfs_rtrefcount_continue_op(rcur, ri, bno); 1506 + break; 1507 + case XFS_REFCOUNT_ALLOC_COW: 1508 + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); 1509 + if (error) 1510 + return error; 1511 + ri->ri_blockcount = 0; 1512 + break; 1513 + case XFS_REFCOUNT_FREE_COW: 1514 + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); 1515 + if (error) 1516 + return error; 1517 + ri->ri_blockcount = 0; 1518 + break; 1519 + default: 1520 + ASSERT(0); 1521 + return -EFSCORRUPTED; 1522 + } 1523 + if (!error && ri->ri_blockcount > 0) 1524 + trace_xfs_refcount_finish_one_leftover(mp, ri); 1525 + return error; 1526 + } 1527 + 1528 + /* 1467 1529 * Record a refcount intent for later processing. 
1468 1530 */ 1469 1531 static void 1470 1532 __xfs_refcount_add( 1471 1533 struct xfs_trans *tp, 1472 1534 enum xfs_refcount_intent_type type, 1535 + bool isrt, 1473 1536 xfs_fsblock_t startblock, 1474 1537 xfs_extlen_t blockcount) 1475 1538 { ··· 1591 1434 ri->ri_type = type; 1592 1435 ri->ri_startblock = startblock; 1593 1436 ri->ri_blockcount = blockcount; 1437 + ri->ri_realtime = isrt; 1594 1438 1595 1439 xfs_refcount_defer_add(tp, ri); 1596 1440 } ··· 1602 1444 void 1603 1445 xfs_refcount_increase_extent( 1604 1446 struct xfs_trans *tp, 1447 + bool isrt, 1605 1448 struct xfs_bmbt_irec *PREV) 1606 1449 { 1607 1450 if (!xfs_has_reflink(tp->t_mountp)) 1608 1451 return; 1609 1452 1610 - __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, 1453 + __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, isrt, PREV->br_startblock, 1611 1454 PREV->br_blockcount); 1612 1455 } 1613 1456 ··· 1618 1459 void 1619 1460 xfs_refcount_decrease_extent( 1620 1461 struct xfs_trans *tp, 1462 + bool isrt, 1621 1463 struct xfs_bmbt_irec *PREV) 1622 1464 { 1623 1465 if (!xfs_has_reflink(tp->t_mountp)) 1624 1466 return; 1625 1467 1626 - __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, 1468 + __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, isrt, PREV->br_startblock, 1627 1469 PREV->br_blockcount); 1628 1470 } 1629 1471 ··· 1826 1666 goto out_error; 1827 1667 } 1828 1668 if (!found_rec) { 1829 - ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; 1669 + ext.rc_startblock = xfs_group_max_blocks(cur->bc_group); 1830 1670 ext.rc_blockcount = 0; 1831 1671 ext.rc_refcount = 0; 1832 1672 ext.rc_domain = XFS_REFC_DOMAIN_COW; ··· 1980 1820 void 1981 1821 xfs_refcount_alloc_cow_extent( 1982 1822 struct xfs_trans *tp, 1823 + bool isrt, 1983 1824 xfs_fsblock_t fsb, 1984 1825 xfs_extlen_t len) 1985 1826 { ··· 1989 1828 if (!xfs_has_reflink(mp)) 1990 1829 return; 1991 1830 1992 - __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); 1831 + __xfs_refcount_add(tp, 
XFS_REFCOUNT_ALLOC_COW, isrt, fsb, len); 1993 1832 1994 1833 /* Add rmap entry */ 1995 - xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 1996 - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 1834 + xfs_rmap_alloc_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); 1997 1835 } 1998 1836 1999 1837 /* Forget a CoW staging event in the refcount btree. */ 2000 1838 void 2001 1839 xfs_refcount_free_cow_extent( 2002 1840 struct xfs_trans *tp, 1841 + bool isrt, 2003 1842 xfs_fsblock_t fsb, 2004 1843 xfs_extlen_t len) 2005 1844 { ··· 2009 1848 return; 2010 1849 2011 1850 /* Remove rmap entry */ 2012 - xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb), 2013 - XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW); 2014 - __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len); 1851 + xfs_rmap_free_extent(tp, isrt, fsb, len, XFS_RMAP_OWN_COW); 1852 + __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, isrt, fsb, len); 2015 1853 } 2016 1854 2017 1855 struct xfs_refcount_recovery { ··· 2039 1879 INIT_LIST_HEAD(&rr->rr_list); 2040 1880 xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); 2041 1881 2042 - if (xfs_refcount_check_irec(to_perag(cur->bc_group), &rr->rr_rrec) != 2043 - NULL || 1882 + if (xfs_refcount_check_btrec(cur, &rr->rr_rrec) != NULL || 2044 1883 XFS_IS_CORRUPT(cur->bc_mp, 2045 1884 rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { 2046 1885 xfs_btree_mark_sick(cur); ··· 2054 1895 /* Find and remove leftover CoW reservations. 
*/ 2055 1896 int 2056 1897 xfs_refcount_recover_cow_leftovers( 2057 - struct xfs_mount *mp, 2058 - struct xfs_perag *pag) 1898 + struct xfs_group *xg) 2059 1899 { 1900 + struct xfs_mount *mp = xg->xg_mount; 1901 + bool isrt = xg->xg_type == XG_TYPE_RTG; 2060 1902 struct xfs_trans *tp; 2061 1903 struct xfs_btree_cur *cur; 2062 - struct xfs_buf *agbp; 1904 + struct xfs_buf *agbp = NULL; 2063 1905 struct xfs_refcount_recovery *rr, *n; 2064 1906 struct list_head debris; 2065 1907 union xfs_btree_irec low = { ··· 2073 1913 xfs_fsblock_t fsb; 2074 1914 int error; 2075 1915 2076 - /* reflink filesystems mustn't have AGs larger than 2^31-1 blocks */ 1916 + /* reflink filesystems must not have groups larger than 2^31-1 blocks */ 1917 + BUILD_BUG_ON(XFS_MAX_RGBLOCKS >= XFS_REFC_COWFLAG); 2077 1918 BUILD_BUG_ON(XFS_MAX_CRC_AG_BLOCKS >= XFS_REFC_COWFLAG); 2078 - if (mp->m_sb.sb_agblocks > XFS_MAX_CRC_AG_BLOCKS) 2079 - return -EOPNOTSUPP; 1919 + 1920 + if (isrt) { 1921 + if (!xfs_has_rtgroups(mp)) 1922 + return 0; 1923 + if (xfs_group_max_blocks(xg) >= XFS_MAX_RGBLOCKS) 1924 + return -EOPNOTSUPP; 1925 + } else { 1926 + if (xfs_group_max_blocks(xg) > XFS_MAX_CRC_AG_BLOCKS) 1927 + return -EOPNOTSUPP; 1928 + } 2080 1929 2081 1930 INIT_LIST_HEAD(&debris); 2082 1931 ··· 2103 1934 if (error) 2104 1935 return error; 2105 1936 2106 - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); 2107 - if (error) 2108 - goto out_trans; 2109 - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); 1937 + if (isrt) { 1938 + xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); 1939 + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(xg)); 1940 + } else { 1941 + error = xfs_alloc_read_agf(to_perag(xg), tp, 0, &agbp); 1942 + if (error) 1943 + goto out_trans; 1944 + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, to_perag(xg)); 1945 + } 2110 1946 2111 1947 /* Find all the leftover CoW staging extents. 
*/ 2112 1948 error = xfs_btree_query_range(cur, &low, &high, 2113 1949 xfs_refcount_recover_extent, &debris); 2114 1950 xfs_btree_del_cursor(cur, error); 2115 - xfs_trans_brelse(tp, agbp); 1951 + if (agbp) 1952 + xfs_trans_brelse(tp, agbp); 1953 + else 1954 + xfs_rtgroup_unlock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); 2116 1955 xfs_trans_cancel(tp); 2117 1956 if (error) 2118 1957 goto out_free; ··· 2133 1956 goto out_free; 2134 1957 2135 1958 /* Free the orphan record */ 2136 - fsb = xfs_agbno_to_fsb(pag, rr->rr_rrec.rc_startblock); 2137 - xfs_refcount_free_cow_extent(tp, fsb, 1959 + fsb = xfs_gbno_to_fsb(xg, rr->rr_rrec.rc_startblock); 1960 + xfs_refcount_free_cow_extent(tp, isrt, fsb, 2138 1961 rr->rr_rrec.rc_blockcount); 2139 1962 2140 1963 /* Free the block. */ 2141 1964 error = xfs_free_extent_later(tp, fsb, 2142 1965 rr->rr_rrec.rc_blockcount, NULL, 2143 - XFS_AG_RESV_NONE, 0); 1966 + XFS_AG_RESV_NONE, 1967 + isrt ? XFS_FREE_EXTENT_REALTIME : 0); 2144 1968 if (error) 2145 1969 goto out_trans; 2146 1970 ··· 2206 2028 xfs_failaddr_t fa; 2207 2029 2208 2030 xfs_refcount_btrec_to_irec(rec, &irec); 2209 - fa = xfs_refcount_check_irec(to_perag(cur->bc_group), &irec); 2031 + fa = xfs_refcount_check_btrec(cur, &irec); 2210 2032 if (fa) 2211 2033 return xfs_refcount_complain_bad_rec(cur, fa, &irec); 2212 2034

+14 -9

fs/xfs/libxfs/xfs_refcount.h

··· 12 12 struct xfs_btree_cur; 13 13 struct xfs_bmbt_irec; 14 14 struct xfs_refcount_irec; 15 + struct xfs_rtgroup; 15 16 16 17 extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, 17 18 enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat); ··· 61 60 enum xfs_refcount_intent_type ri_type; 62 61 xfs_extlen_t ri_blockcount; 63 62 xfs_fsblock_t ri_startblock; 63 + bool ri_realtime; 64 64 }; 65 65 66 66 /* Check that the refcount is appropriate for the record domain. */ ··· 76 74 return true; 77 75 } 78 76 79 - void xfs_refcount_increase_extent(struct xfs_trans *tp, 77 + void xfs_refcount_increase_extent(struct xfs_trans *tp, bool isrt, 80 78 struct xfs_bmbt_irec *irec); 81 - void xfs_refcount_decrease_extent(struct xfs_trans *tp, 79 + void xfs_refcount_decrease_extent(struct xfs_trans *tp, bool isrt, 82 80 struct xfs_bmbt_irec *irec); 83 81 84 - extern int xfs_refcount_finish_one(struct xfs_trans *tp, 82 + int xfs_refcount_finish_one(struct xfs_trans *tp, 83 + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); 84 + int xfs_rtrefcount_finish_one(struct xfs_trans *tp, 85 85 struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); 86 86 87 87 extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, 88 88 xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, 89 89 xfs_extlen_t *flen, bool find_end_of_shared); 90 90 91 - void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, 92 - xfs_extlen_t len); 93 - void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, 94 - xfs_extlen_t len); 95 - extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, 96 - struct xfs_perag *pag); 91 + void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, bool isrt, 92 + xfs_fsblock_t fsb, xfs_extlen_t len); 93 + void xfs_refcount_free_cow_extent(struct xfs_trans *tp, bool isrt, 94 + xfs_fsblock_t fsb, xfs_extlen_t len); 95 + int xfs_refcount_recover_cow_leftovers(struct xfs_group *xg); 97 96 98 97 /* 
99 98 * While we're adjusting the refcounts records of an extent, we have ··· 122 119 extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, 123 120 struct xfs_refcount_irec *irec); 124 121 xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, 122 + const struct xfs_refcount_irec *irec); 123 + xfs_failaddr_t xfs_rtrefcount_check_irec(struct xfs_rtgroup *rtg, 125 124 const struct xfs_refcount_irec *irec); 126 125 extern int xfs_refcount_insert(struct xfs_btree_cur *cur, 127 126 struct xfs_refcount_irec *irec, int *stat);

+141 -37

fs/xfs/libxfs/xfs_rmap.c

··· 25 25 #include "xfs_ag.h" 26 26 #include "xfs_health.h" 27 27 #include "xfs_rmap_item.h" 28 + #include "xfs_rtgroup.h" 29 + #include "xfs_rtrmap_btree.h" 28 30 29 31 struct kmem_cache *xfs_rmap_intent_cache; 30 32 ··· 266 264 return NULL; 267 265 } 268 266 267 + static xfs_failaddr_t 268 + xfs_rtrmap_check_meta_irec( 269 + struct xfs_rtgroup *rtg, 270 + const struct xfs_rmap_irec *irec) 271 + { 272 + struct xfs_mount *mp = rtg_mount(rtg); 273 + 274 + if (irec->rm_offset != 0) 275 + return __this_address; 276 + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) 277 + return __this_address; 278 + 279 + switch (irec->rm_owner) { 280 + case XFS_RMAP_OWN_FS: 281 + if (irec->rm_startblock != 0) 282 + return __this_address; 283 + if (irec->rm_blockcount != mp->m_sb.sb_rextsize) 284 + return __this_address; 285 + return NULL; 286 + case XFS_RMAP_OWN_COW: 287 + if (!xfs_has_rtreflink(mp)) 288 + return __this_address; 289 + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, 290 + irec->rm_blockcount)) 291 + return __this_address; 292 + return NULL; 293 + default: 294 + return __this_address; 295 + } 296 + 297 + return NULL; 298 + } 299 + 300 + static xfs_failaddr_t 301 + xfs_rtrmap_check_inode_irec( 302 + struct xfs_rtgroup *rtg, 303 + const struct xfs_rmap_irec *irec) 304 + { 305 + struct xfs_mount *mp = rtg_mount(rtg); 306 + 307 + if (!xfs_verify_ino(mp, irec->rm_owner)) 308 + return __this_address; 309 + if (!xfs_verify_rgbext(rtg, irec->rm_startblock, irec->rm_blockcount)) 310 + return __this_address; 311 + if (!xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount)) 312 + return __this_address; 313 + return NULL; 314 + } 315 + 316 + xfs_failaddr_t 317 + xfs_rtrmap_check_irec( 318 + struct xfs_rtgroup *rtg, 319 + const struct xfs_rmap_irec *irec) 320 + { 321 + if (irec->rm_blockcount == 0) 322 + return __this_address; 323 + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK)) 324 + return __this_address; 325 + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) 326 + 
return xfs_rtrmap_check_meta_irec(rtg, irec); 327 + return xfs_rtrmap_check_inode_irec(rtg, irec); 328 + } 329 + 269 330 static inline xfs_failaddr_t 270 331 xfs_rmap_check_btrec( 271 332 struct xfs_btree_cur *cur, 272 333 const struct xfs_rmap_irec *irec) 273 334 { 335 + if (xfs_btree_is_rtrmap(cur->bc_ops) || 336 + xfs_btree_is_mem_rtrmap(cur->bc_ops)) 337 + return xfs_rtrmap_check_irec(to_rtg(cur->bc_group), irec); 274 338 return xfs_rmap_check_irec(to_perag(cur->bc_group), irec); 275 339 } 276 340 ··· 351 283 if (xfs_btree_is_mem_rmap(cur->bc_ops)) 352 284 xfs_warn(mp, 353 285 "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa); 286 + else if (xfs_btree_is_rtrmap(cur->bc_ops)) 287 + xfs_warn(mp, 288 + "RT Reverse Mapping BTree record corruption in rtgroup %u detected at %pS!", 289 + cur->bc_group->xg_gno, fa); 354 290 else 355 291 xfs_warn(mp, 356 292 "Reverse Mapping BTree record corruption in AG %d detected at %pS!", ··· 597 525 struct xfs_btree_cur *cur, 598 526 uint64_t ltoff, 599 527 struct xfs_rmap_irec *rec, 600 - xfs_filblks_t len, 528 + xfs_extlen_t len, 601 529 uint64_t owner, 602 530 uint64_t offset, 603 531 unsigned int flags) ··· 2628 2556 } 2629 2557 } 2630 2558 2559 + static int 2560 + xfs_rmap_finish_init_cursor( 2561 + struct xfs_trans *tp, 2562 + struct xfs_rmap_intent *ri, 2563 + struct xfs_btree_cur **pcur) 2564 + { 2565 + struct xfs_perag *pag = to_perag(ri->ri_group); 2566 + struct xfs_buf *agbp = NULL; 2567 + int error; 2568 + 2569 + /* 2570 + * Refresh the freelist before we start changing the rmapbt, because a 2571 + * shape change could cause us to allocate blocks. 
2572 + */ 2573 + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); 2574 + if (error) { 2575 + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); 2576 + return error; 2577 + } 2578 + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { 2579 + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); 2580 + return -EFSCORRUPTED; 2581 + } 2582 + *pcur = xfs_rmapbt_init_cursor(tp->t_mountp, tp, agbp, pag); 2583 + return 0; 2584 + } 2585 + 2586 + static int 2587 + xfs_rtrmap_finish_init_cursor( 2588 + struct xfs_trans *tp, 2589 + struct xfs_rmap_intent *ri, 2590 + struct xfs_btree_cur **pcur) 2591 + { 2592 + struct xfs_rtgroup *rtg = to_rtg(ri->ri_group); 2593 + 2594 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 2595 + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); 2596 + *pcur = xfs_rtrmapbt_init_cursor(tp, rtg); 2597 + return 0; 2598 + } 2599 + 2631 2600 /* 2632 2601 * Process one of the deferred rmap operations. We pass back the 2633 2602 * btree cursor to maintain our lock on the rmapbt between calls. ··· 2684 2571 { 2685 2572 struct xfs_owner_info oinfo; 2686 2573 struct xfs_mount *mp = tp->t_mountp; 2687 - struct xfs_btree_cur *rcur = *pcur; 2688 - struct xfs_buf *agbp = NULL; 2689 2574 xfs_agblock_t bno; 2690 2575 bool unwritten; 2691 2576 int error = 0; ··· 2697 2586 * If we haven't gotten a cursor or the cursor AG doesn't match 2698 2587 * the startblock, get one now. 2699 2588 */ 2700 - if (rcur != NULL && rcur->bc_group != ri->ri_group) { 2701 - xfs_btree_del_cursor(rcur, 0); 2702 - rcur = NULL; 2589 + if (*pcur != NULL && (*pcur)->bc_group != ri->ri_group) { 2590 + xfs_btree_del_cursor(*pcur, 0); 2703 2591 *pcur = NULL; 2704 2592 } 2705 - if (rcur == NULL) { 2706 - struct xfs_perag *pag = to_perag(ri->ri_group); 2707 - 2708 - /* 2709 - * Refresh the freelist before we start changing the 2710 - * rmapbt, because a shape change could cause us to 2711 - * allocate blocks. 
2712 - */ 2713 - error = xfs_free_extent_fix_freelist(tp, pag, &agbp); 2714 - if (error) { 2715 - xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); 2593 + if (*pcur == NULL) { 2594 + if (ri->ri_group->xg_type == XG_TYPE_RTG) 2595 + error = xfs_rtrmap_finish_init_cursor(tp, ri, pcur); 2596 + else 2597 + error = xfs_rmap_finish_init_cursor(tp, ri, pcur); 2598 + if (error) 2716 2599 return error; 2717 - } 2718 - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { 2719 - xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); 2720 - return -EFSCORRUPTED; 2721 - } 2722 - 2723 - *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); 2724 2600 } 2725 2601 2726 2602 xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, 2727 2603 ri->ri_bmap.br_startoff); 2728 2604 unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; 2729 - bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); 2730 2605 2731 - error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno, 2606 + bno = xfs_fsb_to_gbno(mp, ri->ri_bmap.br_startblock, 2607 + ri->ri_group->xg_type); 2608 + error = __xfs_rmap_finish_intent(*pcur, ri->ri_type, bno, 2732 2609 ri->ri_bmap.br_blockcount, &oinfo, unwritten); 2733 2610 if (error) 2734 2611 return error; ··· 2746 2647 struct xfs_trans *tp, 2747 2648 enum xfs_rmap_intent_type type, 2748 2649 uint64_t owner, 2650 + bool isrt, 2749 2651 int whichfork, 2750 2652 struct xfs_bmbt_irec *bmap) 2751 2653 { ··· 2758 2658 ri->ri_owner = owner; 2759 2659 ri->ri_whichfork = whichfork; 2760 2660 ri->ri_bmap = *bmap; 2661 + ri->ri_realtime = isrt; 2761 2662 2762 2663 xfs_rmap_defer_add(tp, ri); 2763 2664 } ··· 2772 2671 struct xfs_bmbt_irec *PREV) 2773 2672 { 2774 2673 enum xfs_rmap_intent_type type = XFS_RMAP_MAP; 2674 + bool isrt = xfs_ifork_is_realtime(ip, whichfork); 2775 2675 2776 2676 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2777 2677 return; ··· 2780 2678 if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) 2781 2679 type = XFS_RMAP_MAP_SHARED; 2782 2680 2783 - 
__xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); 2681 + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); 2784 2682 } 2785 2683 2786 2684 /* Unmap an extent out of a file. */ ··· 2792 2690 struct xfs_bmbt_irec *PREV) 2793 2691 { 2794 2692 enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; 2693 + bool isrt = xfs_ifork_is_realtime(ip, whichfork); 2795 2694 2796 2695 if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) 2797 2696 return; ··· 2800 2697 if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) 2801 2698 type = XFS_RMAP_UNMAP_SHARED; 2802 2699 2803 - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); 2700 + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); 2804 2701 } 2805 2702 2806 2703 /* ··· 2818 2715 struct xfs_bmbt_irec *PREV) 2819 2716 { 2820 2717 enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; 2718 + bool isrt = xfs_ifork_is_realtime(ip, whichfork); 2821 2719 2822 2720 if (!xfs_rmap_update_is_needed(mp, whichfork)) 2823 2721 return; ··· 2826 2722 if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) 2827 2723 type = XFS_RMAP_CONVERT_SHARED; 2828 2724 2829 - __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); 2725 + __xfs_rmap_add(tp, type, ip->i_ino, isrt, whichfork, PREV); 2830 2726 } 2831 2727 2832 2728 /* Schedule the creation of an rmap for non-file data. 
*/ 2833 2729 void 2834 2730 xfs_rmap_alloc_extent( 2835 2731 struct xfs_trans *tp, 2836 - xfs_agnumber_t agno, 2837 - xfs_agblock_t bno, 2732 + bool isrt, 2733 + xfs_fsblock_t fsbno, 2838 2734 xfs_extlen_t len, 2839 2735 uint64_t owner) 2840 2736 { ··· 2843 2739 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2844 2740 return; 2845 2741 2846 - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2742 + bmap.br_startblock = fsbno; 2847 2743 bmap.br_blockcount = len; 2848 2744 bmap.br_startoff = 0; 2849 2745 bmap.br_state = XFS_EXT_NORM; 2850 2746 2851 - __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap); 2747 + __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, isrt, XFS_DATA_FORK, &bmap); 2852 2748 } 2853 2749 2854 2750 /* Schedule the deletion of an rmap for non-file data. */ 2855 2751 void 2856 2752 xfs_rmap_free_extent( 2857 2753 struct xfs_trans *tp, 2858 - xfs_agnumber_t agno, 2859 - xfs_agblock_t bno, 2754 + bool isrt, 2755 + xfs_fsblock_t fsbno, 2860 2756 xfs_extlen_t len, 2861 2757 uint64_t owner) 2862 2758 { ··· 2865 2761 if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK)) 2866 2762 return; 2867 2763 2868 - bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno); 2764 + bmap.br_startblock = fsbno; 2869 2765 bmap.br_blockcount = len; 2870 2766 bmap.br_startoff = 0; 2871 2767 bmap.br_state = XFS_EXT_NORM; 2872 2768 2873 - __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); 2769 + __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, isrt, XFS_DATA_FORK, &bmap); 2874 2770 } 2875 2771 2876 2772 /* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */

+8 -4

fs/xfs/libxfs/xfs_rmap.h

··· 7 7 #define __XFS_RMAP_H__ 8 8 9 9 struct xfs_perag; 10 + struct xfs_rtgroup; 10 11 11 12 static inline void 12 13 xfs_rmap_ino_bmbt_owner( ··· 175 174 uint64_t ri_owner; 176 175 struct xfs_bmbt_irec ri_bmap; 177 176 struct xfs_group *ri_group; 177 + bool ri_realtime; 178 178 }; 179 179 180 180 /* functions for updating the rmapbt based on bmbt map/unmap operations */ ··· 186 184 void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp, 187 185 struct xfs_inode *ip, int whichfork, 188 186 struct xfs_bmbt_irec *imap); 189 - void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 190 - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 191 - void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, 192 - xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner); 187 + void xfs_rmap_alloc_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, 188 + xfs_extlen_t len, uint64_t owner); 189 + void xfs_rmap_free_extent(struct xfs_trans *tp, bool isrt, xfs_fsblock_t fsbno, 190 + xfs_extlen_t len, uint64_t owner); 193 191 194 192 int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, 195 193 struct xfs_btree_cur **pcur); ··· 207 205 xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, 208 206 struct xfs_rmap_irec *irec); 209 207 xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag, 208 + const struct xfs_rmap_irec *irec); 209 + xfs_failaddr_t xfs_rtrmap_check_irec(struct xfs_rtgroup *rtg, 210 210 const struct xfs_rmap_irec *irec); 211 211 212 212 int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,

+1 -1

fs/xfs/libxfs/xfs_rtbitmap.c

··· 1055 1055 xfs_rtxlen_t len) /* length of extent freed */ 1056 1056 { 1057 1057 struct xfs_mount *mp = tp->t_mountp; 1058 - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; 1058 + struct xfs_inode *rbmip = rtg_bitmap(rtg); 1059 1059 struct xfs_rtalloc_args args = { 1060 1060 .mp = mp, 1061 1061 .tp = tp,

+9

fs/xfs/libxfs/xfs_rtbitmap.h

··· 135 135 return div_u64(rtbno, mp->m_sb.sb_rextsize); 136 136 } 137 137 138 + /* Return the offset of a rtgroup block number within an rt extent. */ 139 + static inline xfs_extlen_t 140 + xfs_rgbno_to_rtxoff( 141 + struct xfs_mount *mp, 142 + xfs_rgblock_t rgbno) 143 + { 144 + return rgbno % mp->m_sb.sb_rextsize; 145 + } 146 + 138 147 /* Return the offset of an rt block number within an rt extent. */ 139 148 static inline xfs_extlen_t 140 149 xfs_rtb_to_rtxoff(

+60 -14

fs/xfs/libxfs/xfs_rtgroup.c

··· 33 33 #include "xfs_rtbitmap.h" 34 34 #include "xfs_metafile.h" 35 35 #include "xfs_metadir.h" 36 + #include "xfs_rtrmap_btree.h" 37 + #include "xfs_rtrefcount_btree.h" 36 38 37 39 /* Find the first usable fsblock in this rtgroup. */ 38 40 static inline uint32_t ··· 199 197 * Lock both realtime free space metadata inodes for a freespace 200 198 * update. 201 199 */ 202 - xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL); 203 - xfs_ilock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL); 200 + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); 201 + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); 204 202 } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { 205 - xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED); 203 + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); 206 204 } 205 + 206 + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) 207 + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_EXCL); 208 + 209 + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) 210 + xfs_ilock(rtg_refcount(rtg), XFS_ILOCK_EXCL); 207 211 } 208 212 209 213 /* Unlock metadata inodes associated with this rt group. 
*/ ··· 222 214 ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || 223 215 !(rtglock_flags & XFS_RTGLOCK_BITMAP)); 224 216 217 + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) 218 + xfs_iunlock(rtg_refcount(rtg), XFS_ILOCK_EXCL); 219 + 220 + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) 221 + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); 222 + 225 223 if (rtglock_flags & XFS_RTGLOCK_BITMAP) { 226 - xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL); 227 - xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL); 224 + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); 225 + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); 228 226 } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { 229 - xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED); 227 + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); 230 228 } 231 229 } 232 230 ··· 250 236 ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); 251 237 252 238 if (rtglock_flags & XFS_RTGLOCK_BITMAP) { 253 - xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_BITMAP], 254 - XFS_ILOCK_EXCL); 255 - xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_SUMMARY], 256 - XFS_ILOCK_EXCL); 239 + xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); 240 + xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); 257 241 } 242 + 243 + if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) 244 + xfs_trans_ijoin(tp, rtg_rmap(rtg), XFS_ILOCK_EXCL); 245 + 246 + if ((rtglock_flags & XFS_RTGLOCK_REFCOUNT) && rtg_refcount(rtg)) 247 + xfs_trans_ijoin(tp, rtg_refcount(rtg), XFS_ILOCK_EXCL); 258 248 } 259 249 260 250 /* Retrieve rt group geometry. 
*/ ··· 302 284 const struct xfs_inode *ip = 303 285 container_of(m, struct xfs_inode, i_lock.dep_map); 304 286 305 - printk(KERN_CONT " rgno=%u", ip->i_projid); 287 + printk(KERN_CONT " rgno=%u metatype=%s", ip->i_projid, 288 + xfs_metafile_type_str(ip->i_metatype)); 306 289 } 307 290 308 291 /* ··· 335 316 336 317 unsigned int sick; /* rtgroup sickness flag */ 337 318 319 + unsigned int fmt_mask; /* all valid data fork formats */ 320 + 338 321 /* Does the fs have this feature? */ 339 - bool (*enabled)(struct xfs_mount *mp); 322 + bool (*enabled)(const struct xfs_mount *mp); 340 323 341 324 /* Create this rtgroup metadata inode and initialize it. */ 342 325 int (*create)(struct xfs_rtgroup *rtg, ··· 352 331 .name = "bitmap", 353 332 .metafile_type = XFS_METAFILE_RTBITMAP, 354 333 .sick = XFS_SICK_RG_BITMAP, 334 + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | 335 + (1U << XFS_DINODE_FMT_BTREE), 355 336 .create = xfs_rtbitmap_create, 356 337 }, 357 338 [XFS_RTGI_SUMMARY] = { 358 339 .name = "summary", 359 340 .metafile_type = XFS_METAFILE_RTSUMMARY, 360 341 .sick = XFS_SICK_RG_SUMMARY, 342 + .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | 343 + (1U << XFS_DINODE_FMT_BTREE), 361 344 .create = xfs_rtsummary_create, 345 + }, 346 + [XFS_RTGI_RMAP] = { 347 + .name = "rmap", 348 + .metafile_type = XFS_METAFILE_RTRMAP, 349 + .sick = XFS_SICK_RG_RMAPBT, 350 + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, 351 + /* 352 + * growfs must create the rtrmap inodes before adding a 353 + * realtime volume to the filesystem, so we cannot use the 354 + * rtrmapbt predicate here. 
355 + */ 356 + .enabled = xfs_has_rmapbt, 357 + .create = xfs_rtrmapbt_create, 358 + }, 359 + [XFS_RTGI_REFCOUNT] = { 360 + .name = "refcount", 361 + .metafile_type = XFS_METAFILE_RTREFCOUNT, 362 + .sick = XFS_SICK_RG_REFCNTBT, 363 + .fmt_mask = 1U << XFS_DINODE_FMT_META_BTREE, 364 + /* same comment about growfs and rmap inodes applies here */ 365 + .enabled = xfs_has_reflink, 366 + .create = xfs_rtrefcountbt_create, 362 367 }, 363 368 }; 364 369 ··· 482 435 return error; 483 436 } 484 437 485 - if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 486 - ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) { 438 + if (XFS_IS_CORRUPT(mp, !((1U << ip->i_df.if_format) & ops->fmt_mask))) { 487 439 xfs_irele(ip); 488 440 xfs_rtginode_mark_sick(rtg, type); 489 441 return -EFSCORRUPTED;

+57 -1

fs/xfs/libxfs/xfs_rtgroup.h

··· 14 14 enum xfs_rtg_inodes { 15 15 XFS_RTGI_BITMAP, /* allocation bitmap */ 16 16 XFS_RTGI_SUMMARY, /* allocation summary */ 17 + XFS_RTGI_RMAP, /* rmap btree inode */ 18 + XFS_RTGI_REFCOUNT, /* refcount btree inode */ 17 19 18 20 XFS_RTGI_MAX, 19 21 }; ··· 64 62 static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) 65 63 { 66 64 return rtg->rtg_group.xg_gno; 65 + } 66 + 67 + static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) 68 + { 69 + return rtg->rtg_inodes[XFS_RTGI_BITMAP]; 70 + } 71 + 72 + static inline struct xfs_inode *rtg_summary(const struct xfs_rtgroup *rtg) 73 + { 74 + return rtg->rtg_inodes[XFS_RTGI_SUMMARY]; 75 + } 76 + 77 + static inline struct xfs_inode *rtg_rmap(const struct xfs_rtgroup *rtg) 78 + { 79 + return rtg->rtg_inodes[XFS_RTGI_RMAP]; 80 + } 81 + 82 + static inline struct xfs_inode *rtg_refcount(const struct xfs_rtgroup *rtg) 83 + { 84 + return rtg->rtg_inodes[XFS_RTGI_REFCOUNT]; 67 85 } 68 86 69 87 /* Passive rtgroup references */ ··· 142 120 struct xfs_rtgroup *rtg) 143 121 { 144 122 return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1); 123 + } 124 + 125 + static inline bool 126 + xfs_verify_rgbno( 127 + struct xfs_rtgroup *rtg, 128 + xfs_rgblock_t rgbno) 129 + { 130 + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); 131 + 132 + return xfs_verify_gbno(rtg_group(rtg), rgbno); 133 + } 134 + 135 + /* 136 + * Check that [@rgbno,@len] is a valid extent range in @rtg. 137 + * 138 + * Must only be used for RTG-enabled file systems. 
139 + */ 140 + static inline bool 141 + xfs_verify_rgbext( 142 + struct xfs_rtgroup *rtg, 143 + xfs_rgblock_t rgbno, 144 + xfs_extlen_t len) 145 + { 146 + ASSERT(xfs_has_rtgroups(rtg_mount(rtg))); 147 + 148 + return xfs_verify_gbext(rtg_group(rtg), rgbno, len); 145 149 } 146 150 147 151 static inline xfs_rtblock_t ··· 271 223 #define XFS_RTGLOCK_BITMAP (1U << 0) 272 224 /* Lock the rt bitmap inode in shared mode */ 273 225 #define XFS_RTGLOCK_BITMAP_SHARED (1U << 1) 226 + /* Lock the rt rmap inode in exclusive mode */ 227 + #define XFS_RTGLOCK_RMAP (1U << 2) 228 + /* Lock the rt refcount inode in exclusive mode */ 229 + #define XFS_RTGLOCK_REFCOUNT (1U << 3) 274 230 275 231 #define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \ 276 - XFS_RTGLOCK_BITMAP_SHARED) 232 + XFS_RTGLOCK_BITMAP_SHARED | \ 233 + XFS_RTGLOCK_RMAP | \ 234 + XFS_RTGLOCK_REFCOUNT) 277 235 278 236 void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); 279 237 void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags); ··· 300 246 struct xfs_trans *tp); 301 247 int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, 302 248 bool init); 249 + void xfs_rtginode_irele(struct xfs_inode **ipp); 250 + 303 251 void xfs_rtginode_irele(struct xfs_inode **ipp); 304 252 305 253 static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno,

+757

fs/xfs/libxfs/xfs_rtrefcount_btree.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_log_format.h" 11 + #include "xfs_trans_resv.h" 12 + #include "xfs_bit.h" 13 + #include "xfs_sb.h" 14 + #include "xfs_mount.h" 15 + #include "xfs_defer.h" 16 + #include "xfs_inode.h" 17 + #include "xfs_trans.h" 18 + #include "xfs_alloc.h" 19 + #include "xfs_btree.h" 20 + #include "xfs_btree_staging.h" 21 + #include "xfs_rtrefcount_btree.h" 22 + #include "xfs_refcount.h" 23 + #include "xfs_trace.h" 24 + #include "xfs_cksum.h" 25 + #include "xfs_error.h" 26 + #include "xfs_extent_busy.h" 27 + #include "xfs_rtgroup.h" 28 + #include "xfs_rtbitmap.h" 29 + #include "xfs_metafile.h" 30 + #include "xfs_health.h" 31 + 32 + static struct kmem_cache *xfs_rtrefcountbt_cur_cache; 33 + 34 + /* 35 + * Realtime Reference Count btree. 36 + * 37 + * This is a btree used to track the owner(s) of a given extent in the realtime 38 + * device. See the comments in xfs_refcount_btree.c for more information. 39 + * 40 + * This tree is basically the same as the regular refcount btree except that 41 + * it's rooted in an inode. 
42 + */ 43 + 44 + static struct xfs_btree_cur * 45 + xfs_rtrefcountbt_dup_cursor( 46 + struct xfs_btree_cur *cur) 47 + { 48 + return xfs_rtrefcountbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); 49 + } 50 + 51 + STATIC int 52 + xfs_rtrefcountbt_get_minrecs( 53 + struct xfs_btree_cur *cur, 54 + int level) 55 + { 56 + if (level == cur->bc_nlevels - 1) { 57 + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 58 + 59 + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, 60 + level == 0) / 2; 61 + } 62 + 63 + return cur->bc_mp->m_rtrefc_mnr[level != 0]; 64 + } 65 + 66 + STATIC int 67 + xfs_rtrefcountbt_get_maxrecs( 68 + struct xfs_btree_cur *cur, 69 + int level) 70 + { 71 + if (level == cur->bc_nlevels - 1) { 72 + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 73 + 74 + return xfs_rtrefcountbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, 75 + level == 0); 76 + } 77 + 78 + return cur->bc_mp->m_rtrefc_mxr[level != 0]; 79 + } 80 + 81 + /* 82 + * Calculate number of records in a realtime refcount btree inode root. 83 + */ 84 + unsigned int 85 + xfs_rtrefcountbt_droot_maxrecs( 86 + unsigned int blocklen, 87 + bool leaf) 88 + { 89 + blocklen -= sizeof(struct xfs_rtrefcount_root); 90 + 91 + if (leaf) 92 + return blocklen / sizeof(struct xfs_refcount_rec); 93 + return blocklen / (2 * sizeof(struct xfs_refcount_key) + 94 + sizeof(xfs_rtrefcount_ptr_t)); 95 + } 96 + 97 + /* 98 + * Get the maximum records we could store in the on-disk format. 99 + * 100 + * For non-root nodes this is equivalent to xfs_rtrefcountbt_get_maxrecs, but 101 + * for the root node this checks the available space in the dinode fork so that 102 + * we can resize the in-memory buffer to match it. After a resize to the 103 + * maximum size this function returns the same value as 104 + * xfs_rtrefcountbt_get_maxrecs for the root node, too. 
105 + */ 106 + STATIC int 107 + xfs_rtrefcountbt_get_dmaxrecs( 108 + struct xfs_btree_cur *cur, 109 + int level) 110 + { 111 + if (level != cur->bc_nlevels - 1) 112 + return cur->bc_mp->m_rtrefc_mxr[level != 0]; 113 + return xfs_rtrefcountbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); 114 + } 115 + 116 + STATIC void 117 + xfs_rtrefcountbt_init_key_from_rec( 118 + union xfs_btree_key *key, 119 + const union xfs_btree_rec *rec) 120 + { 121 + key->refc.rc_startblock = rec->refc.rc_startblock; 122 + } 123 + 124 + STATIC void 125 + xfs_rtrefcountbt_init_high_key_from_rec( 126 + union xfs_btree_key *key, 127 + const union xfs_btree_rec *rec) 128 + { 129 + __u32 x; 130 + 131 + x = be32_to_cpu(rec->refc.rc_startblock); 132 + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; 133 + key->refc.rc_startblock = cpu_to_be32(x); 134 + } 135 + 136 + STATIC void 137 + xfs_rtrefcountbt_init_rec_from_cur( 138 + struct xfs_btree_cur *cur, 139 + union xfs_btree_rec *rec) 140 + { 141 + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; 142 + uint32_t start; 143 + 144 + start = xfs_refcount_encode_startblock(irec->rc_startblock, 145 + irec->rc_domain); 146 + rec->refc.rc_startblock = cpu_to_be32(start); 147 + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); 148 + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); 149 + } 150 + 151 + STATIC void 152 + xfs_rtrefcountbt_init_ptr_from_cur( 153 + struct xfs_btree_cur *cur, 154 + union xfs_btree_ptr *ptr) 155 + { 156 + ptr->l = 0; 157 + } 158 + 159 + STATIC int64_t 160 + xfs_rtrefcountbt_key_diff( 161 + struct xfs_btree_cur *cur, 162 + const union xfs_btree_key *key) 163 + { 164 + const struct xfs_refcount_key *kp = &key->refc; 165 + const struct xfs_refcount_irec *irec = &cur->bc_rec.rc; 166 + uint32_t start; 167 + 168 + start = xfs_refcount_encode_startblock(irec->rc_startblock, 169 + irec->rc_domain); 170 + return (int64_t)be32_to_cpu(kp->rc_startblock) - start; 171 + } 172 + 173 + STATIC int64_t 174 
+ xfs_rtrefcountbt_diff_two_keys( 175 + struct xfs_btree_cur *cur, 176 + const union xfs_btree_key *k1, 177 + const union xfs_btree_key *k2, 178 + const union xfs_btree_key *mask) 179 + { 180 + ASSERT(!mask || mask->refc.rc_startblock); 181 + 182 + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - 183 + be32_to_cpu(k2->refc.rc_startblock); 184 + } 185 + 186 + static xfs_failaddr_t 187 + xfs_rtrefcountbt_verify( 188 + struct xfs_buf *bp) 189 + { 190 + struct xfs_mount *mp = bp->b_target->bt_mount; 191 + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 192 + xfs_failaddr_t fa; 193 + int level; 194 + 195 + if (!xfs_verify_magic(bp, block->bb_magic)) 196 + return __this_address; 197 + 198 + if (!xfs_has_reflink(mp)) 199 + return __this_address; 200 + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); 201 + if (fa) 202 + return fa; 203 + level = be16_to_cpu(block->bb_level); 204 + if (level > mp->m_rtrefc_maxlevels) 205 + return __this_address; 206 + 207 + return xfs_btree_fsblock_verify(bp, mp->m_rtrefc_mxr[level != 0]); 208 + } 209 + 210 + static void 211 + xfs_rtrefcountbt_read_verify( 212 + struct xfs_buf *bp) 213 + { 214 + xfs_failaddr_t fa; 215 + 216 + if (!xfs_btree_fsblock_verify_crc(bp)) 217 + xfs_verifier_error(bp, -EFSBADCRC, __this_address); 218 + else { 219 + fa = xfs_rtrefcountbt_verify(bp); 220 + if (fa) 221 + xfs_verifier_error(bp, -EFSCORRUPTED, fa); 222 + } 223 + 224 + if (bp->b_error) 225 + trace_xfs_btree_corrupt(bp, _RET_IP_); 226 + } 227 + 228 + static void 229 + xfs_rtrefcountbt_write_verify( 230 + struct xfs_buf *bp) 231 + { 232 + xfs_failaddr_t fa; 233 + 234 + fa = xfs_rtrefcountbt_verify(bp); 235 + if (fa) { 236 + trace_xfs_btree_corrupt(bp, _RET_IP_); 237 + xfs_verifier_error(bp, -EFSCORRUPTED, fa); 238 + return; 239 + } 240 + xfs_btree_fsblock_calc_crc(bp); 241 + 242 + } 243 + 244 + const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops = { 245 + .name = "xfs_rtrefcountbt", 246 + .magic = { 0, cpu_to_be32(XFS_RTREFC_CRC_MAGIC) 
}, 247 + .verify_read = xfs_rtrefcountbt_read_verify, 248 + .verify_write = xfs_rtrefcountbt_write_verify, 249 + .verify_struct = xfs_rtrefcountbt_verify, 250 + }; 251 + 252 + STATIC int 253 + xfs_rtrefcountbt_keys_inorder( 254 + struct xfs_btree_cur *cur, 255 + const union xfs_btree_key *k1, 256 + const union xfs_btree_key *k2) 257 + { 258 + return be32_to_cpu(k1->refc.rc_startblock) < 259 + be32_to_cpu(k2->refc.rc_startblock); 260 + } 261 + 262 + STATIC int 263 + xfs_rtrefcountbt_recs_inorder( 264 + struct xfs_btree_cur *cur, 265 + const union xfs_btree_rec *r1, 266 + const union xfs_btree_rec *r2) 267 + { 268 + return be32_to_cpu(r1->refc.rc_startblock) + 269 + be32_to_cpu(r1->refc.rc_blockcount) <= 270 + be32_to_cpu(r2->refc.rc_startblock); 271 + } 272 + 273 + STATIC enum xbtree_key_contig 274 + xfs_rtrefcountbt_keys_contiguous( 275 + struct xfs_btree_cur *cur, 276 + const union xfs_btree_key *key1, 277 + const union xfs_btree_key *key2, 278 + const union xfs_btree_key *mask) 279 + { 280 + ASSERT(!mask || mask->refc.rc_startblock); 281 + 282 + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), 283 + be32_to_cpu(key2->refc.rc_startblock)); 284 + } 285 + 286 + static inline void 287 + xfs_rtrefcountbt_move_ptrs( 288 + struct xfs_mount *mp, 289 + struct xfs_btree_block *broot, 290 + short old_size, 291 + size_t new_size, 292 + unsigned int numrecs) 293 + { 294 + void *dptr; 295 + void *sptr; 296 + 297 + sptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, old_size); 298 + dptr = xfs_rtrefcount_broot_ptr_addr(mp, broot, 1, new_size); 299 + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrefcount_ptr_t)); 300 + } 301 + 302 + static struct xfs_btree_block * 303 + xfs_rtrefcountbt_broot_realloc( 304 + struct xfs_btree_cur *cur, 305 + unsigned int new_numrecs) 306 + { 307 + struct xfs_mount *mp = cur->bc_mp; 308 + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 309 + struct xfs_btree_block *broot; 310 + unsigned int new_size; 311 + unsigned int old_size = 
ifp->if_broot_bytes; 312 + const unsigned int level = cur->bc_nlevels - 1; 313 + 314 + new_size = xfs_rtrefcount_broot_space_calc(mp, level, new_numrecs); 315 + 316 + /* Handle the nop case quietly. */ 317 + if (new_size == old_size) 318 + return ifp->if_broot; 319 + 320 + if (new_size > old_size) { 321 + unsigned int old_numrecs; 322 + 323 + /* 324 + * If there wasn't any memory allocated before, just allocate 325 + * it now and get out. 326 + */ 327 + if (old_size == 0) 328 + return xfs_broot_realloc(ifp, new_size); 329 + 330 + /* 331 + * If there is already an existing if_broot, then we need to 332 + * realloc it and possibly move the node block pointers because 333 + * those are not butted up against the btree block header. 334 + */ 335 + old_numrecs = xfs_rtrefcountbt_maxrecs(mp, old_size, level); 336 + broot = xfs_broot_realloc(ifp, new_size); 337 + if (level > 0) 338 + xfs_rtrefcountbt_move_ptrs(mp, broot, old_size, 339 + new_size, old_numrecs); 340 + goto out_broot; 341 + } 342 + 343 + /* 344 + * We're reducing numrecs. If we're going all the way to zero, just 345 + * free the block. 346 + */ 347 + ASSERT(ifp->if_broot != NULL && old_size > 0); 348 + if (new_size == 0) 349 + return xfs_broot_realloc(ifp, 0); 350 + 351 + /* 352 + * Shrink the btree root by possibly moving the rtrefcountbt pointers, 353 + * since they are not butted up against the btree block header. Then 354 + * reallocate broot.
355 + */ 356 + if (level > 0) 357 + xfs_rtrefcountbt_move_ptrs(mp, ifp->if_broot, old_size, 358 + new_size, new_numrecs); 359 + broot = xfs_broot_realloc(ifp, new_size); 360 + 361 + out_broot: 362 + ASSERT(xfs_rtrefcount_droot_space(broot) <= 363 + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); 364 + return broot; 365 + } 366 + 367 + const struct xfs_btree_ops xfs_rtrefcountbt_ops = { 368 + .name = "rtrefcount", 369 + .type = XFS_BTREE_TYPE_INODE, 370 + .geom_flags = XFS_BTGEO_IROOT_RECORDS, 371 + 372 + .rec_len = sizeof(struct xfs_refcount_rec), 373 + .key_len = sizeof(struct xfs_refcount_key), 374 + .ptr_len = XFS_BTREE_LONG_PTR_LEN, 375 + 376 + .lru_refs = XFS_REFC_BTREE_REF, 377 + .statoff = XFS_STATS_CALC_INDEX(xs_rtrefcbt_2), 378 + .sick_mask = XFS_SICK_RG_REFCNTBT, 379 + 380 + .dup_cursor = xfs_rtrefcountbt_dup_cursor, 381 + .alloc_block = xfs_btree_alloc_metafile_block, 382 + .free_block = xfs_btree_free_metafile_block, 383 + .get_minrecs = xfs_rtrefcountbt_get_minrecs, 384 + .get_maxrecs = xfs_rtrefcountbt_get_maxrecs, 385 + .get_dmaxrecs = xfs_rtrefcountbt_get_dmaxrecs, 386 + .init_key_from_rec = xfs_rtrefcountbt_init_key_from_rec, 387 + .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, 388 + .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, 389 + .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, 390 + .key_diff = xfs_rtrefcountbt_key_diff, 391 + .buf_ops = &xfs_rtrefcountbt_buf_ops, 392 + .diff_two_keys = xfs_rtrefcountbt_diff_two_keys, 393 + .keys_inorder = xfs_rtrefcountbt_keys_inorder, 394 + .recs_inorder = xfs_rtrefcountbt_recs_inorder, 395 + .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, 396 + .broot_realloc = xfs_rtrefcountbt_broot_realloc, 397 + }; 398 + 399 + /* Allocate a new rt refcount btree cursor. 
*/ 400 + struct xfs_btree_cur * 401 + xfs_rtrefcountbt_init_cursor( 402 + struct xfs_trans *tp, 403 + struct xfs_rtgroup *rtg) 404 + { 405 + struct xfs_inode *ip = rtg_refcount(rtg); 406 + struct xfs_mount *mp = rtg_mount(rtg); 407 + struct xfs_btree_cur *cur; 408 + 409 + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); 410 + 411 + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrefcountbt_ops, 412 + mp->m_rtrefc_maxlevels, xfs_rtrefcountbt_cur_cache); 413 + 414 + cur->bc_ino.ip = ip; 415 + cur->bc_refc.nr_ops = 0; 416 + cur->bc_refc.shape_changes = 0; 417 + cur->bc_group = xfs_group_hold(rtg_group(rtg)); 418 + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; 419 + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); 420 + cur->bc_ino.whichfork = XFS_DATA_FORK; 421 + return cur; 422 + } 423 + 424 + /* 425 + * Install a new rt reference count btree root. Caller is responsible for 426 + * invalidating and freeing the old btree blocks. 427 + */ 428 + void 429 + xfs_rtrefcountbt_commit_staged_btree( 430 + struct xfs_btree_cur *cur, 431 + struct xfs_trans *tp) 432 + { 433 + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; 434 + struct xfs_ifork *ifp; 435 + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; 436 + 437 + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); 438 + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); 439 + 440 + /* 441 + * Free any resources hanging off the real fork, then shallow-copy the 442 + * staging fork's contents into the real fork to transfer everything 443 + * we just built. 444 + */ 445 + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); 446 + xfs_idestroy_fork(ifp); 447 + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); 448 + 449 + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; 450 + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); 451 + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); 452 + } 453 + 454 + /* Calculate number of records in a realtime refcount btree block.
*/ 455 + static inline unsigned int 456 + xfs_rtrefcountbt_block_maxrecs( 457 + unsigned int blocklen, 458 + bool leaf) 459 + { 460 + 461 + if (leaf) 462 + return blocklen / sizeof(struct xfs_refcount_rec); 463 + return blocklen / (sizeof(struct xfs_refcount_key) + 464 + sizeof(xfs_rtrefcount_ptr_t)); 465 + } 466 + 467 + /* 468 + * Calculate number of records in a realtime refcount btree block. 469 + */ 470 + unsigned int 471 + xfs_rtrefcountbt_maxrecs( 472 + struct xfs_mount *mp, 473 + unsigned int blocklen, 474 + bool leaf) 475 + { 476 + blocklen -= XFS_RTREFCOUNT_BLOCK_LEN; 477 + return xfs_rtrefcountbt_block_maxrecs(blocklen, leaf); 478 + } 479 + 480 + /* Compute the max possible height for realtime refcount btrees. */ 481 + unsigned int 482 + xfs_rtrefcountbt_maxlevels_ondisk(void) 483 + { 484 + unsigned int minrecs[2]; 485 + unsigned int blocklen; 486 + 487 + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; 488 + 489 + minrecs[0] = xfs_rtrefcountbt_block_maxrecs(blocklen, true) / 2; 490 + minrecs[1] = xfs_rtrefcountbt_block_maxrecs(blocklen, false) / 2; 491 + 492 + /* We need at most one record for every block in an rt group. */ 493 + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_RGBLOCKS); 494 + } 495 + 496 + int __init 497 + xfs_rtrefcountbt_init_cur_cache(void) 498 + { 499 + xfs_rtrefcountbt_cur_cache = kmem_cache_create("xfs_rtrefcountbt_cur", 500 + xfs_btree_cur_sizeof( 501 + xfs_rtrefcountbt_maxlevels_ondisk()), 502 + 0, 0, NULL); 503 + 504 + if (!xfs_rtrefcountbt_cur_cache) 505 + return -ENOMEM; 506 + return 0; 507 + } 508 + 509 + void 510 + xfs_rtrefcountbt_destroy_cur_cache(void) 511 + { 512 + kmem_cache_destroy(xfs_rtrefcountbt_cur_cache); 513 + xfs_rtrefcountbt_cur_cache = NULL; 514 + } 515 + 516 + /* Compute the maximum height of a realtime refcount btree.
*/ 517 + void 518 + xfs_rtrefcountbt_compute_maxlevels( 519 + struct xfs_mount *mp) 520 + { 521 + unsigned int d_maxlevels, r_maxlevels; 522 + 523 + if (!xfs_has_rtreflink(mp)) { 524 + mp->m_rtrefc_maxlevels = 0; 525 + return; 526 + } 527 + 528 + /* 529 + * The realtime refcountbt lives on the data device, which means that 530 + * its maximum height is constrained by the size of the data device and 531 + * the height required to store one refcount record for each rtextent 532 + * in an rt group. 533 + */ 534 + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrefc_mnr, 535 + mp->m_sb.sb_dblocks); 536 + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrefc_mnr, 537 + mp->m_sb.sb_rgextents); 538 + 539 + /* Add one level to handle the inode root level. */ 540 + mp->m_rtrefc_maxlevels = min(d_maxlevels, r_maxlevels) + 1; 541 + } 542 + 543 + /* Calculate the rtrefcount btree size for some records. */ 544 + unsigned long long 545 + xfs_rtrefcountbt_calc_size( 546 + struct xfs_mount *mp, 547 + unsigned long long len) 548 + { 549 + return xfs_btree_calc_size(mp->m_rtrefc_mnr, len); 550 + } 551 + 552 + /* 553 + * Calculate the maximum refcount btree size. 554 + */ 555 + static unsigned long long 556 + xfs_rtrefcountbt_max_size( 557 + struct xfs_mount *mp, 558 + xfs_rtblock_t rtblocks) 559 + { 560 + /* Bail out if we're uninitialized, which can happen in mkfs. */ 561 + if (mp->m_rtrefc_mxr[0] == 0) 562 + return 0; 563 + 564 + return xfs_rtrefcountbt_calc_size(mp, rtblocks); 565 + } 566 + 567 + /* 568 + * Figure out how many blocks to reserve and how many are used by this btree. 569 + * We need enough space to hold one record for every rt extent in the rtgroup. 570 + */ 571 + xfs_filblks_t 572 + xfs_rtrefcountbt_calc_reserves( 573 + struct xfs_mount *mp) 574 + { 575 + if (!xfs_has_rtreflink(mp)) 576 + return 0; 577 + 578 + return xfs_rtrefcountbt_max_size(mp, mp->m_sb.sb_rgextents); 579 + } 580 + 581 + /* 582 + * Convert on-disk form of btree root to in-memory form. 
583 + */ 584 + STATIC void 585 + xfs_rtrefcountbt_from_disk( 586 + struct xfs_inode *ip, 587 + struct xfs_rtrefcount_root *dblock, 588 + int dblocklen, 589 + struct xfs_btree_block *rblock) 590 + { 591 + struct xfs_mount *mp = ip->i_mount; 592 + struct xfs_refcount_key *fkp; 593 + __be64 *fpp; 594 + struct xfs_refcount_key *tkp; 595 + __be64 *tpp; 596 + struct xfs_refcount_rec *frp; 597 + struct xfs_refcount_rec *trp; 598 + unsigned int numrecs; 599 + unsigned int maxrecs; 600 + unsigned int rblocklen; 601 + 602 + rblocklen = xfs_rtrefcount_broot_space(mp, dblock); 603 + 604 + xfs_btree_init_block(mp, rblock, &xfs_rtrefcountbt_ops, 0, 0, 605 + ip->i_ino); 606 + 607 + rblock->bb_level = dblock->bb_level; 608 + rblock->bb_numrecs = dblock->bb_numrecs; 609 + 610 + if (be16_to_cpu(rblock->bb_level) > 0) { 611 + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); 612 + fkp = xfs_rtrefcount_droot_key_addr(dblock, 1); 613 + tkp = xfs_rtrefcount_key_addr(rblock, 1); 614 + fpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); 615 + tpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); 616 + numrecs = be16_to_cpu(dblock->bb_numrecs); 617 + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); 618 + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); 619 + } else { 620 + frp = xfs_rtrefcount_droot_rec_addr(dblock, 1); 621 + trp = xfs_rtrefcount_rec_addr(rblock, 1); 622 + numrecs = be16_to_cpu(dblock->bb_numrecs); 623 + memcpy(trp, frp, sizeof(*frp) * numrecs); 624 + } 625 + } 626 + 627 + /* Load a realtime reference count btree root in from disk. 
*/ 628 + int 629 + xfs_iformat_rtrefcount( 630 + struct xfs_inode *ip, 631 + struct xfs_dinode *dip) 632 + { 633 + struct xfs_mount *mp = ip->i_mount; 634 + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 635 + struct xfs_btree_block *broot; 636 + unsigned int numrecs; 637 + unsigned int level; 638 + int dsize; 639 + 640 + /* 641 + * growfs must create the rtrefcount inodes before adding a realtime 642 + * volume to the filesystem, so we cannot use the rtrefcount predicate 643 + * here. 644 + */ 645 + if (!xfs_has_reflink(ip->i_mount)) { 646 + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 647 + return -EFSCORRUPTED; 648 + } 649 + 650 + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); 651 + numrecs = be16_to_cpu(dfp->bb_numrecs); 652 + level = be16_to_cpu(dfp->bb_level); 653 + 654 + if (level > mp->m_rtrefc_maxlevels || 655 + xfs_rtrefcount_droot_space_calc(level, numrecs) > dsize) { 656 + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 657 + return -EFSCORRUPTED; 658 + } 659 + 660 + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), 661 + xfs_rtrefcount_broot_space_calc(mp, level, numrecs)); 662 + if (broot) 663 + xfs_rtrefcountbt_from_disk(ip, dfp, dsize, broot); 664 + return 0; 665 + } 666 + 667 + /* 668 + * Convert in-memory form of btree root to on-disk form. 
669 + */ 670 + void 671 + xfs_rtrefcountbt_to_disk( 672 + struct xfs_mount *mp, 673 + struct xfs_btree_block *rblock, 674 + int rblocklen, 675 + struct xfs_rtrefcount_root *dblock, 676 + int dblocklen) 677 + { 678 + struct xfs_refcount_key *fkp; 679 + __be64 *fpp; 680 + struct xfs_refcount_key *tkp; 681 + __be64 *tpp; 682 + struct xfs_refcount_rec *frp; 683 + struct xfs_refcount_rec *trp; 684 + unsigned int maxrecs; 685 + unsigned int numrecs; 686 + 687 + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTREFC_CRC_MAGIC)); 688 + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); 689 + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); 690 + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); 691 + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); 692 + 693 + dblock->bb_level = rblock->bb_level; 694 + dblock->bb_numrecs = rblock->bb_numrecs; 695 + 696 + if (be16_to_cpu(rblock->bb_level) > 0) { 697 + maxrecs = xfs_rtrefcountbt_droot_maxrecs(dblocklen, false); 698 + fkp = xfs_rtrefcount_key_addr(rblock, 1); 699 + tkp = xfs_rtrefcount_droot_key_addr(dblock, 1); 700 + fpp = xfs_rtrefcount_broot_ptr_addr(mp, rblock, 1, rblocklen); 701 + tpp = xfs_rtrefcount_droot_ptr_addr(dblock, 1, maxrecs); 702 + numrecs = be16_to_cpu(rblock->bb_numrecs); 703 + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); 704 + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); 705 + } else { 706 + frp = xfs_rtrefcount_rec_addr(rblock, 1); 707 + trp = xfs_rtrefcount_droot_rec_addr(dblock, 1); 708 + numrecs = be16_to_cpu(rblock->bb_numrecs); 709 + memcpy(trp, frp, sizeof(*frp) * numrecs); 710 + } 711 + } 712 + 713 + /* Flush a realtime reference count btree root out to disk. 
*/ 714 + void 715 + xfs_iflush_rtrefcount( 716 + struct xfs_inode *ip, 717 + struct xfs_dinode *dip) 718 + { 719 + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 720 + struct xfs_rtrefcount_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 721 + 722 + ASSERT(ifp->if_broot != NULL); 723 + ASSERT(ifp->if_broot_bytes > 0); 724 + ASSERT(xfs_rtrefcount_droot_space(ifp->if_broot) <= 725 + xfs_inode_fork_size(ip, XFS_DATA_FORK)); 726 + xfs_rtrefcountbt_to_disk(ip->i_mount, ifp->if_broot, 727 + ifp->if_broot_bytes, dfp, 728 + XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); 729 + } 730 + 731 + /* 732 + * Create a realtime refcount btree inode. 733 + */ 734 + int 735 + xfs_rtrefcountbt_create( 736 + struct xfs_rtgroup *rtg, 737 + struct xfs_inode *ip, 738 + struct xfs_trans *tp, 739 + bool init) 740 + { 741 + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 742 + struct xfs_mount *mp = ip->i_mount; 743 + struct xfs_btree_block *broot; 744 + 745 + ifp->if_format = XFS_DINODE_FMT_META_BTREE; 746 + ASSERT(ifp->if_broot_bytes == 0); 747 + ASSERT(ifp->if_bytes == 0); 748 + 749 + /* Initialize the empty incore btree root. */ 750 + broot = xfs_broot_realloc(ifp, 751 + xfs_rtrefcount_broot_space_calc(mp, 0, 0)); 752 + if (broot) 753 + xfs_btree_init_block(mp, broot, &xfs_rtrefcountbt_ops, 0, 0, 754 + ip->i_ino); 755 + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); 756 + return 0; 757 + }

+189

fs/xfs/libxfs/xfs_rtrefcount_btree.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef __XFS_RTREFCOUNT_BTREE_H__ 7 + #define __XFS_RTREFCOUNT_BTREE_H__ 8 + 9 + struct xfs_buf; 10 + struct xfs_btree_cur; 11 + struct xfs_mount; 12 + struct xbtree_ifakeroot; 13 + struct xfs_rtgroup; 14 + 15 + /* refcounts only exist on crc enabled filesystems */ 16 + #define XFS_RTREFCOUNT_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN 17 + 18 + struct xfs_btree_cur *xfs_rtrefcountbt_init_cursor(struct xfs_trans *tp, 19 + struct xfs_rtgroup *rtg); 20 + struct xfs_btree_cur *xfs_rtrefcountbt_stage_cursor(struct xfs_mount *mp, 21 + struct xfs_rtgroup *rtg, struct xfs_inode *ip, 22 + struct xbtree_ifakeroot *ifake); 23 + void xfs_rtrefcountbt_commit_staged_btree(struct xfs_btree_cur *cur, 24 + struct xfs_trans *tp); 25 + unsigned int xfs_rtrefcountbt_maxrecs(struct xfs_mount *mp, 26 + unsigned int blocklen, bool leaf); 27 + void xfs_rtrefcountbt_compute_maxlevels(struct xfs_mount *mp); 28 + unsigned int xfs_rtrefcountbt_droot_maxrecs(unsigned int blocklen, bool leaf); 29 + 30 + /* 31 + * Addresses of records, keys, and pointers within an incore rtrefcountbt block. 
32 + * 33 + * (note that some of these may appear unused, but they are used in userspace) 34 + */ 35 + static inline struct xfs_refcount_rec * 36 + xfs_rtrefcount_rec_addr( 37 + struct xfs_btree_block *block, 38 + unsigned int index) 39 + { 40 + return (struct xfs_refcount_rec *) 41 + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + 42 + (index - 1) * sizeof(struct xfs_refcount_rec)); 43 + } 44 + 45 + static inline struct xfs_refcount_key * 46 + xfs_rtrefcount_key_addr( 47 + struct xfs_btree_block *block, 48 + unsigned int index) 49 + { 50 + return (struct xfs_refcount_key *) 51 + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + 52 + (index - 1) * sizeof(struct xfs_refcount_key)); 53 + } 54 + 55 + static inline xfs_rtrefcount_ptr_t * 56 + xfs_rtrefcount_ptr_addr( 57 + struct xfs_btree_block *block, 58 + unsigned int index, 59 + unsigned int maxrecs) 60 + { 61 + return (xfs_rtrefcount_ptr_t *) 62 + ((char *)block + XFS_RTREFCOUNT_BLOCK_LEN + 63 + maxrecs * sizeof(struct xfs_refcount_key) + 64 + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); 65 + } 66 + 67 + unsigned int xfs_rtrefcountbt_maxlevels_ondisk(void); 68 + int __init xfs_rtrefcountbt_init_cur_cache(void); 69 + void xfs_rtrefcountbt_destroy_cur_cache(void); 70 + 71 + xfs_filblks_t xfs_rtrefcountbt_calc_reserves(struct xfs_mount *mp); 72 + unsigned long long xfs_rtrefcountbt_calc_size(struct xfs_mount *mp, 73 + unsigned long long len); 74 + 75 + /* Addresses of key, pointers, and records within an ondisk rtrefcount block. 
*/ 76 + 77 + static inline struct xfs_refcount_rec * 78 + xfs_rtrefcount_droot_rec_addr( 79 + struct xfs_rtrefcount_root *block, 80 + unsigned int index) 81 + { 82 + return (struct xfs_refcount_rec *) 83 + ((char *)(block + 1) + 84 + (index - 1) * sizeof(struct xfs_refcount_rec)); 85 + } 86 + 87 + static inline struct xfs_refcount_key * 88 + xfs_rtrefcount_droot_key_addr( 89 + struct xfs_rtrefcount_root *block, 90 + unsigned int index) 91 + { 92 + return (struct xfs_refcount_key *) 93 + ((char *)(block + 1) + 94 + (index - 1) * sizeof(struct xfs_refcount_key)); 95 + } 96 + 97 + static inline xfs_rtrefcount_ptr_t * 98 + xfs_rtrefcount_droot_ptr_addr( 99 + struct xfs_rtrefcount_root *block, 100 + unsigned int index, 101 + unsigned int maxrecs) 102 + { 103 + return (xfs_rtrefcount_ptr_t *) 104 + ((char *)(block + 1) + 105 + maxrecs * sizeof(struct xfs_refcount_key) + 106 + (index - 1) * sizeof(xfs_rtrefcount_ptr_t)); 107 + } 108 + 109 + /* 110 + * Address of pointers within the incore btree root. 111 + * 112 + * These are to be used when we know the size of the block and 113 + * we don't have a cursor. 114 + */ 115 + static inline xfs_rtrefcount_ptr_t * 116 + xfs_rtrefcount_broot_ptr_addr( 117 + struct xfs_mount *mp, 118 + struct xfs_btree_block *bb, 119 + unsigned int index, 120 + unsigned int block_size) 121 + { 122 + return xfs_rtrefcount_ptr_addr(bb, index, 123 + xfs_rtrefcountbt_maxrecs(mp, block_size, false)); 124 + } 125 + 126 + /* 127 + * Compute the space required for the incore btree root containing the given 128 + * number of records. 
129 + */ 130 + static inline size_t 131 + xfs_rtrefcount_broot_space_calc( 132 + struct xfs_mount *mp, 133 + unsigned int level, 134 + unsigned int nrecs) 135 + { 136 + size_t sz = XFS_RTREFCOUNT_BLOCK_LEN; 137 + 138 + if (level > 0) 139 + return sz + nrecs * (sizeof(struct xfs_refcount_key) + 140 + sizeof(xfs_rtrefcount_ptr_t)); 141 + return sz + nrecs * sizeof(struct xfs_refcount_rec); 142 + } 143 + 144 + /* 145 + * Compute the space required for the incore btree root given the ondisk 146 + * btree root block. 147 + */ 148 + static inline size_t 149 + xfs_rtrefcount_broot_space(struct xfs_mount *mp, struct xfs_rtrefcount_root *bb) 150 + { 151 + return xfs_rtrefcount_broot_space_calc(mp, be16_to_cpu(bb->bb_level), 152 + be16_to_cpu(bb->bb_numrecs)); 153 + } 154 + 155 + /* Compute the space required for the ondisk root block. */ 156 + static inline size_t 157 + xfs_rtrefcount_droot_space_calc( 158 + unsigned int level, 159 + unsigned int nrecs) 160 + { 161 + size_t sz = sizeof(struct xfs_rtrefcount_root); 162 + 163 + if (level > 0) 164 + return sz + nrecs * (sizeof(struct xfs_refcount_key) + 165 + sizeof(xfs_rtrefcount_ptr_t)); 166 + return sz + nrecs * sizeof(struct xfs_refcount_rec); 167 + } 168 + 169 + /* 170 + * Compute the space required for the ondisk root block given an incore root 171 + * block. 
172 + */ 173 + static inline size_t 174 + xfs_rtrefcount_droot_space(struct xfs_btree_block *bb) 175 + { 176 + return xfs_rtrefcount_droot_space_calc(be16_to_cpu(bb->bb_level), 177 + be16_to_cpu(bb->bb_numrecs)); 178 + } 179 + 180 + int xfs_iformat_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); 181 + void xfs_rtrefcountbt_to_disk(struct xfs_mount *mp, 182 + struct xfs_btree_block *rblock, int rblocklen, 183 + struct xfs_rtrefcount_root *dblock, int dblocklen); 184 + void xfs_iflush_rtrefcount(struct xfs_inode *ip, struct xfs_dinode *dip); 185 + 186 + int xfs_rtrefcountbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, 187 + struct xfs_trans *tp, bool init); 188 + 189 + #endif /* __XFS_RTREFCOUNT_BTREE_H__ */

+1035

fs/xfs/libxfs/xfs_rtrmap_btree.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_log_format.h" 11 + #include "xfs_trans_resv.h" 12 + #include "xfs_bit.h" 13 + #include "xfs_sb.h" 14 + #include "xfs_mount.h" 15 + #include "xfs_defer.h" 16 + #include "xfs_inode.h" 17 + #include "xfs_trans.h" 18 + #include "xfs_alloc.h" 19 + #include "xfs_btree.h" 20 + #include "xfs_btree_staging.h" 21 + #include "xfs_metafile.h" 22 + #include "xfs_rmap.h" 23 + #include "xfs_rtrmap_btree.h" 24 + #include "xfs_trace.h" 25 + #include "xfs_cksum.h" 26 + #include "xfs_error.h" 27 + #include "xfs_extent_busy.h" 28 + #include "xfs_rtgroup.h" 29 + #include "xfs_bmap.h" 30 + #include "xfs_health.h" 31 + #include "xfs_buf_mem.h" 32 + #include "xfs_btree_mem.h" 33 + 34 + static struct kmem_cache *xfs_rtrmapbt_cur_cache; 35 + 36 + /* 37 + * Realtime Reverse Map btree. 38 + * 39 + * This is a btree used to track the owner(s) of a given extent in the realtime 40 + * device. See the comments in xfs_rmap_btree.c for more information. 41 + * 42 + * This tree is basically the same as the regular rmap btree except that it 43 + * is rooted in an inode and does not live in free space. 
44 + */ 45 + 46 + static struct xfs_btree_cur * 47 + xfs_rtrmapbt_dup_cursor( 48 + struct xfs_btree_cur *cur) 49 + { 50 + return xfs_rtrmapbt_init_cursor(cur->bc_tp, to_rtg(cur->bc_group)); 51 + } 52 + 53 + STATIC int 54 + xfs_rtrmapbt_get_minrecs( 55 + struct xfs_btree_cur *cur, 56 + int level) 57 + { 58 + if (level == cur->bc_nlevels - 1) { 59 + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 60 + 61 + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, 62 + level == 0) / 2; 63 + } 64 + 65 + return cur->bc_mp->m_rtrmap_mnr[level != 0]; 66 + } 67 + 68 + STATIC int 69 + xfs_rtrmapbt_get_maxrecs( 70 + struct xfs_btree_cur *cur, 71 + int level) 72 + { 73 + if (level == cur->bc_nlevels - 1) { 74 + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 75 + 76 + return xfs_rtrmapbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, 77 + level == 0); 78 + } 79 + 80 + return cur->bc_mp->m_rtrmap_mxr[level != 0]; 81 + } 82 + 83 + /* Calculate number of records in the ondisk realtime rmap btree inode root. */ 84 + unsigned int 85 + xfs_rtrmapbt_droot_maxrecs( 86 + unsigned int blocklen, 87 + bool leaf) 88 + { 89 + blocklen -= sizeof(struct xfs_rtrmap_root); 90 + 91 + if (leaf) 92 + return blocklen / sizeof(struct xfs_rmap_rec); 93 + return blocklen / (2 * sizeof(struct xfs_rmap_key) + 94 + sizeof(xfs_rtrmap_ptr_t)); 95 + } 96 + 97 + /* 98 + * Get the maximum records we could store in the on-disk format. 99 + * 100 + * For non-root nodes this is equivalent to xfs_rtrmapbt_get_maxrecs, but 101 + * for the root node this checks the available space in the dinode fork 102 + * so that we can resize the in-memory buffer to match it. After a 103 + * resize to the maximum size this function returns the same value 104 + * as xfs_rtrmapbt_get_maxrecs for the root node, too. 
105 + */ 106 + STATIC int 107 + xfs_rtrmapbt_get_dmaxrecs( 108 + struct xfs_btree_cur *cur, 109 + int level) 110 + { 111 + if (level != cur->bc_nlevels - 1) 112 + return cur->bc_mp->m_rtrmap_mxr[level != 0]; 113 + return xfs_rtrmapbt_droot_maxrecs(cur->bc_ino.forksize, level == 0); 114 + } 115 + 116 + /* 117 + * Convert the ondisk record's offset field into the ondisk key's offset field. 118 + * Fork and bmbt are significant parts of the rmap record key, but written 119 + * status is merely a record attribute. 120 + */ 121 + static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec) 122 + { 123 + return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); 124 + } 125 + 126 + STATIC void 127 + xfs_rtrmapbt_init_key_from_rec( 128 + union xfs_btree_key *key, 129 + const union xfs_btree_rec *rec) 130 + { 131 + key->rmap.rm_startblock = rec->rmap.rm_startblock; 132 + key->rmap.rm_owner = rec->rmap.rm_owner; 133 + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); 134 + } 135 + 136 + STATIC void 137 + xfs_rtrmapbt_init_high_key_from_rec( 138 + union xfs_btree_key *key, 139 + const union xfs_btree_rec *rec) 140 + { 141 + uint64_t off; 142 + int adj; 143 + 144 + adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; 145 + 146 + key->rmap.rm_startblock = rec->rmap.rm_startblock; 147 + be32_add_cpu(&key->rmap.rm_startblock, adj); 148 + key->rmap.rm_owner = rec->rmap.rm_owner; 149 + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); 150 + if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || 151 + XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) 152 + return; 153 + off = be64_to_cpu(key->rmap.rm_offset); 154 + off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); 155 + key->rmap.rm_offset = cpu_to_be64(off); 156 + } 157 + 158 + STATIC void 159 + xfs_rtrmapbt_init_rec_from_cur( 160 + struct xfs_btree_cur *cur, 161 + union xfs_btree_rec *rec) 162 + { 163 + rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); 164 
+ rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); 165 + rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); 166 + rec->rmap.rm_offset = cpu_to_be64( 167 + xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); 168 + } 169 + 170 + STATIC void 171 + xfs_rtrmapbt_init_ptr_from_cur( 172 + struct xfs_btree_cur *cur, 173 + union xfs_btree_ptr *ptr) 174 + { 175 + ptr->l = 0; 176 + } 177 + 178 + /* 179 + * Mask the appropriate parts of the ondisk key field for a key comparison. 180 + * Fork and bmbt are significant parts of the rmap record key, but written 181 + * status is merely a record attribute. 182 + */ 183 + static inline uint64_t offset_keymask(uint64_t offset) 184 + { 185 + return offset & ~XFS_RMAP_OFF_UNWRITTEN; 186 + } 187 + 188 + STATIC int64_t 189 + xfs_rtrmapbt_key_diff( 190 + struct xfs_btree_cur *cur, 191 + const union xfs_btree_key *key) 192 + { 193 + struct xfs_rmap_irec *rec = &cur->bc_rec.r; 194 + const struct xfs_rmap_key *kp = &key->rmap; 195 + __u64 x, y; 196 + int64_t d; 197 + 198 + d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; 199 + if (d) 200 + return d; 201 + 202 + x = be64_to_cpu(kp->rm_owner); 203 + y = rec->rm_owner; 204 + if (x > y) 205 + return 1; 206 + else if (y > x) 207 + return -1; 208 + 209 + x = offset_keymask(be64_to_cpu(kp->rm_offset)); 210 + y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); 211 + if (x > y) 212 + return 1; 213 + else if (y > x) 214 + return -1; 215 + return 0; 216 + } 217 + 218 + STATIC int64_t 219 + xfs_rtrmapbt_diff_two_keys( 220 + struct xfs_btree_cur *cur, 221 + const union xfs_btree_key *k1, 222 + const union xfs_btree_key *k2, 223 + const union xfs_btree_key *mask) 224 + { 225 + const struct xfs_rmap_key *kp1 = &k1->rmap; 226 + const struct xfs_rmap_key *kp2 = &k2->rmap; 227 + int64_t d; 228 + __u64 x, y; 229 + 230 + /* Doesn't make sense to mask off the physical space part */ 231 + ASSERT(!mask || mask->rmap.rm_startblock); 232 + 233 + d = 
(int64_t)be32_to_cpu(kp1->rm_startblock) - 234 + be32_to_cpu(kp2->rm_startblock); 235 + if (d) 236 + return d; 237 + 238 + if (!mask || mask->rmap.rm_owner) { 239 + x = be64_to_cpu(kp1->rm_owner); 240 + y = be64_to_cpu(kp2->rm_owner); 241 + if (x > y) 242 + return 1; 243 + else if (y > x) 244 + return -1; 245 + } 246 + 247 + if (!mask || mask->rmap.rm_offset) { 248 + /* Doesn't make sense to allow offset but not owner */ 249 + ASSERT(!mask || mask->rmap.rm_owner); 250 + 251 + x = offset_keymask(be64_to_cpu(kp1->rm_offset)); 252 + y = offset_keymask(be64_to_cpu(kp2->rm_offset)); 253 + if (x > y) 254 + return 1; 255 + else if (y > x) 256 + return -1; 257 + } 258 + 259 + return 0; 260 + } 261 + 262 + static xfs_failaddr_t 263 + xfs_rtrmapbt_verify( 264 + struct xfs_buf *bp) 265 + { 266 + struct xfs_mount *mp = bp->b_target->bt_mount; 267 + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 268 + xfs_failaddr_t fa; 269 + int level; 270 + 271 + if (!xfs_verify_magic(bp, block->bb_magic)) 272 + return __this_address; 273 + 274 + if (!xfs_has_rmapbt(mp)) 275 + return __this_address; 276 + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); 277 + if (fa) 278 + return fa; 279 + level = be16_to_cpu(block->bb_level); 280 + if (level > mp->m_rtrmap_maxlevels) 281 + return __this_address; 282 + 283 + return xfs_btree_fsblock_verify(bp, mp->m_rtrmap_mxr[level != 0]); 284 + } 285 + 286 + static void 287 + xfs_rtrmapbt_read_verify( 288 + struct xfs_buf *bp) 289 + { 290 + xfs_failaddr_t fa; 291 + 292 + if (!xfs_btree_fsblock_verify_crc(bp)) 293 + xfs_verifier_error(bp, -EFSBADCRC, __this_address); 294 + else { 295 + fa = xfs_rtrmapbt_verify(bp); 296 + if (fa) 297 + xfs_verifier_error(bp, -EFSCORRUPTED, fa); 298 + } 299 + 300 + if (bp->b_error) 301 + trace_xfs_btree_corrupt(bp, _RET_IP_); 302 + } 303 + 304 + static void 305 + xfs_rtrmapbt_write_verify( 306 + struct xfs_buf *bp) 307 + { 308 + xfs_failaddr_t fa; 309 + 310 + fa = xfs_rtrmapbt_verify(bp); 311 + if (fa) { 
312 + trace_xfs_btree_corrupt(bp, _RET_IP_); 313 + xfs_verifier_error(bp, -EFSCORRUPTED, fa); 314 + return; 315 + } 316 + xfs_btree_fsblock_calc_crc(bp); 317 + 318 + } 319 + 320 + const struct xfs_buf_ops xfs_rtrmapbt_buf_ops = { 321 + .name = "xfs_rtrmapbt", 322 + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, 323 + .verify_read = xfs_rtrmapbt_read_verify, 324 + .verify_write = xfs_rtrmapbt_write_verify, 325 + .verify_struct = xfs_rtrmapbt_verify, 326 + }; 327 + 328 + STATIC int 329 + xfs_rtrmapbt_keys_inorder( 330 + struct xfs_btree_cur *cur, 331 + const union xfs_btree_key *k1, 332 + const union xfs_btree_key *k2) 333 + { 334 + uint32_t x; 335 + uint32_t y; 336 + uint64_t a; 337 + uint64_t b; 338 + 339 + x = be32_to_cpu(k1->rmap.rm_startblock); 340 + y = be32_to_cpu(k2->rmap.rm_startblock); 341 + if (x < y) 342 + return 1; 343 + else if (x > y) 344 + return 0; 345 + a = be64_to_cpu(k1->rmap.rm_owner); 346 + b = be64_to_cpu(k2->rmap.rm_owner); 347 + if (a < b) 348 + return 1; 349 + else if (a > b) 350 + return 0; 351 + a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset)); 352 + b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset)); 353 + if (a <= b) 354 + return 1; 355 + return 0; 356 + } 357 + 358 + STATIC int 359 + xfs_rtrmapbt_recs_inorder( 360 + struct xfs_btree_cur *cur, 361 + const union xfs_btree_rec *r1, 362 + const union xfs_btree_rec *r2) 363 + { 364 + uint32_t x; 365 + uint32_t y; 366 + uint64_t a; 367 + uint64_t b; 368 + 369 + x = be32_to_cpu(r1->rmap.rm_startblock); 370 + y = be32_to_cpu(r2->rmap.rm_startblock); 371 + if (x < y) 372 + return 1; 373 + else if (x > y) 374 + return 0; 375 + a = be64_to_cpu(r1->rmap.rm_owner); 376 + b = be64_to_cpu(r2->rmap.rm_owner); 377 + if (a < b) 378 + return 1; 379 + else if (a > b) 380 + return 0; 381 + a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); 382 + b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); 383 + if (a <= b) 384 + return 1; 385 + return 0; 386 + } 387 + 388 + STATIC enum xbtree_key_contig 
389 + xfs_rtrmapbt_keys_contiguous( 390 + struct xfs_btree_cur *cur, 391 + const union xfs_btree_key *key1, 392 + const union xfs_btree_key *key2, 393 + const union xfs_btree_key *mask) 394 + { 395 + ASSERT(!mask || mask->rmap.rm_startblock); 396 + 397 + /* 398 + * We only support checking contiguity of the physical space component. 399 + * If any callers ever need more specificity than that, they'll have to 400 + * implement it here. 401 + */ 402 + ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); 403 + 404 + return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), 405 + be32_to_cpu(key2->rmap.rm_startblock)); 406 + } 407 + 408 + static inline void 409 + xfs_rtrmapbt_move_ptrs( 410 + struct xfs_mount *mp, 411 + struct xfs_btree_block *broot, 412 + short old_size, 413 + size_t new_size, 414 + unsigned int numrecs) 415 + { 416 + void *dptr; 417 + void *sptr; 418 + 419 + sptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, old_size); 420 + dptr = xfs_rtrmap_broot_ptr_addr(mp, broot, 1, new_size); 421 + memmove(dptr, sptr, numrecs * sizeof(xfs_rtrmap_ptr_t)); 422 + } 423 + 424 + static struct xfs_btree_block * 425 + xfs_rtrmapbt_broot_realloc( 426 + struct xfs_btree_cur *cur, 427 + unsigned int new_numrecs) 428 + { 429 + struct xfs_mount *mp = cur->bc_mp; 430 + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 431 + struct xfs_btree_block *broot; 432 + unsigned int new_size; 433 + unsigned int old_size = ifp->if_broot_bytes; 434 + const unsigned int level = cur->bc_nlevels - 1; 435 + 436 + new_size = xfs_rtrmap_broot_space_calc(mp, level, new_numrecs); 437 + 438 + /* Handle the nop case quietly. */ 439 + if (new_size == old_size) 440 + return ifp->if_broot; 441 + 442 + if (new_size > old_size) { 443 + unsigned int old_numrecs; 444 + 445 + /* 446 + * If there wasn't any memory allocated before, just allocate 447 + * it now and get out. 
448 + */ 449 + if (old_size == 0) 450 + return xfs_broot_realloc(ifp, new_size); 451 + 452 + /* 453 + * If there is already an existing if_broot, then we need to 454 + * realloc it and possibly move the node block pointers because 455 + * those are not butted up against the btree block header. 456 + */ 457 + old_numrecs = xfs_rtrmapbt_maxrecs(mp, old_size, level == 0); 458 + broot = xfs_broot_realloc(ifp, new_size); 459 + if (level > 0) 460 + xfs_rtrmapbt_move_ptrs(mp, broot, old_size, new_size, 461 + old_numrecs); 462 + goto out_broot; 463 + } 464 + 465 + /* 466 + * We're reducing numrecs. If we're going all the way to zero, just 467 + * free the block. 468 + */ 469 + ASSERT(ifp->if_broot != NULL && old_size > 0); 470 + if (new_size == 0) 471 + return xfs_broot_realloc(ifp, 0); 472 + 473 + /* 474 + * Shrink the btree root by possibly moving the rtrmapbt pointers, 475 + * since they are not butted up against the btree block header. Then 476 + * reallocate broot. 477 + */ 478 + if (level > 0) 479 + xfs_rtrmapbt_move_ptrs(mp, ifp->if_broot, old_size, new_size, 480 + new_numrecs); 481 + broot = xfs_broot_realloc(ifp, new_size); 482 + 483 + out_broot: 484 + ASSERT(xfs_rtrmap_droot_space(broot) <= 485 + xfs_inode_fork_size(cur->bc_ino.ip, cur->bc_ino.whichfork)); 486 + return broot; 487 + } 488 + 489 + const struct xfs_btree_ops xfs_rtrmapbt_ops = { 490 + .name = "rtrmap", 491 + .type = XFS_BTREE_TYPE_INODE, 492 + .geom_flags = XFS_BTGEO_OVERLAPPING | 493 + XFS_BTGEO_IROOT_RECORDS, 494 + 495 + .rec_len = sizeof(struct xfs_rmap_rec), 496 + /* Overlapping btree; 2 keys per pointer. 
*/ 497 + .key_len = 2 * sizeof(struct xfs_rmap_key), 498 + .ptr_len = XFS_BTREE_LONG_PTR_LEN, 499 + 500 + .lru_refs = XFS_RMAP_BTREE_REF, 501 + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_2), 502 + .sick_mask = XFS_SICK_RG_RMAPBT, 503 + 504 + .dup_cursor = xfs_rtrmapbt_dup_cursor, 505 + .alloc_block = xfs_btree_alloc_metafile_block, 506 + .free_block = xfs_btree_free_metafile_block, 507 + .get_minrecs = xfs_rtrmapbt_get_minrecs, 508 + .get_maxrecs = xfs_rtrmapbt_get_maxrecs, 509 + .get_dmaxrecs = xfs_rtrmapbt_get_dmaxrecs, 510 + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, 511 + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, 512 + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, 513 + .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur, 514 + .key_diff = xfs_rtrmapbt_key_diff, 515 + .buf_ops = &xfs_rtrmapbt_buf_ops, 516 + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, 517 + .keys_inorder = xfs_rtrmapbt_keys_inorder, 518 + .recs_inorder = xfs_rtrmapbt_recs_inorder, 519 + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, 520 + .broot_realloc = xfs_rtrmapbt_broot_realloc, 521 + }; 522 + 523 + /* Allocate a new rt rmap btree cursor. 
*/ 524 + struct xfs_btree_cur * 525 + xfs_rtrmapbt_init_cursor( 526 + struct xfs_trans *tp, 527 + struct xfs_rtgroup *rtg) 528 + { 529 + struct xfs_inode *ip = rtg_rmap(rtg); 530 + struct xfs_mount *mp = rtg_mount(rtg); 531 + struct xfs_btree_cur *cur; 532 + 533 + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); 534 + 535 + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_ops, 536 + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); 537 + 538 + cur->bc_ino.ip = ip; 539 + cur->bc_group = xfs_group_hold(rtg_group(rtg)); 540 + cur->bc_ino.whichfork = XFS_DATA_FORK; 541 + cur->bc_nlevels = be16_to_cpu(ip->i_df.if_broot->bb_level) + 1; 542 + cur->bc_ino.forksize = xfs_inode_fork_size(ip, XFS_DATA_FORK); 543 + 544 + return cur; 545 + } 546 + 547 + #ifdef CONFIG_XFS_BTREE_IN_MEM 548 + /* 549 + * Validate an in-memory realtime rmap btree block. Callers are allowed to 550 + * generate an in-memory btree even if the ondisk feature is not enabled. 551 + */ 552 + static xfs_failaddr_t 553 + xfs_rtrmapbt_mem_verify( 554 + struct xfs_buf *bp) 555 + { 556 + struct xfs_mount *mp = bp->b_mount; 557 + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 558 + xfs_failaddr_t fa; 559 + unsigned int level; 560 + unsigned int maxrecs; 561 + 562 + if (!xfs_verify_magic(bp, block->bb_magic)) 563 + return __this_address; 564 + 565 + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); 566 + if (fa) 567 + return fa; 568 + 569 + level = be16_to_cpu(block->bb_level); 570 + if (xfs_has_rmapbt(mp)) { 571 + if (level >= mp->m_rtrmap_maxlevels) 572 + return __this_address; 573 + } else { 574 + if (level >= xfs_rtrmapbt_maxlevels_ondisk()) 575 + return __this_address; 576 + } 577 + 578 + maxrecs = xfs_rtrmapbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0); 579 + return xfs_btree_memblock_verify(bp, maxrecs); 580 + } 581 + 582 + static void 583 + xfs_rtrmapbt_mem_rw_verify( 584 + struct xfs_buf *bp) 585 + { 586 + xfs_failaddr_t fa = xfs_rtrmapbt_mem_verify(bp); 587 + 588 + if (fa) 
589 + xfs_verifier_error(bp, -EFSCORRUPTED, fa); 590 + } 591 + 592 + /* skip crc checks on in-memory btrees to save time */ 593 + static const struct xfs_buf_ops xfs_rtrmapbt_mem_buf_ops = { 594 + .name = "xfs_rtrmapbt_mem", 595 + .magic = { 0, cpu_to_be32(XFS_RTRMAP_CRC_MAGIC) }, 596 + .verify_read = xfs_rtrmapbt_mem_rw_verify, 597 + .verify_write = xfs_rtrmapbt_mem_rw_verify, 598 + .verify_struct = xfs_rtrmapbt_mem_verify, 599 + }; 600 + 601 + const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = { 602 + .type = XFS_BTREE_TYPE_MEM, 603 + .geom_flags = XFS_BTGEO_OVERLAPPING, 604 + 605 + .rec_len = sizeof(struct xfs_rmap_rec), 606 + /* Overlapping btree; 2 keys per pointer. */ 607 + .key_len = 2 * sizeof(struct xfs_rmap_key), 608 + .ptr_len = XFS_BTREE_LONG_PTR_LEN, 609 + 610 + .lru_refs = XFS_RMAP_BTREE_REF, 611 + .statoff = XFS_STATS_CALC_INDEX(xs_rtrmap_mem_2), 612 + 613 + .dup_cursor = xfbtree_dup_cursor, 614 + .set_root = xfbtree_set_root, 615 + .alloc_block = xfbtree_alloc_block, 616 + .free_block = xfbtree_free_block, 617 + .get_minrecs = xfbtree_get_minrecs, 618 + .get_maxrecs = xfbtree_get_maxrecs, 619 + .init_key_from_rec = xfs_rtrmapbt_init_key_from_rec, 620 + .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, 621 + .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, 622 + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, 623 + .key_diff = xfs_rtrmapbt_key_diff, 624 + .buf_ops = &xfs_rtrmapbt_mem_buf_ops, 625 + .diff_two_keys = xfs_rtrmapbt_diff_two_keys, 626 + .keys_inorder = xfs_rtrmapbt_keys_inorder, 627 + .recs_inorder = xfs_rtrmapbt_recs_inorder, 628 + .keys_contiguous = xfs_rtrmapbt_keys_contiguous, 629 + }; 630 + 631 + /* Create a cursor for an in-memory btree. 
*/ 632 + struct xfs_btree_cur * 633 + xfs_rtrmapbt_mem_cursor( 634 + struct xfs_rtgroup *rtg, 635 + struct xfs_trans *tp, 636 + struct xfbtree *xfbt) 637 + { 638 + struct xfs_mount *mp = rtg_mount(rtg); 639 + struct xfs_btree_cur *cur; 640 + 641 + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rtrmapbt_mem_ops, 642 + mp->m_rtrmap_maxlevels, xfs_rtrmapbt_cur_cache); 643 + cur->bc_mem.xfbtree = xfbt; 644 + cur->bc_nlevels = xfbt->nlevels; 645 + cur->bc_group = xfs_group_hold(rtg_group(rtg)); 646 + return cur; 647 + } 648 + 649 + /* Create an in-memory realtime rmap btree. */ 650 + int 651 + xfs_rtrmapbt_mem_init( 652 + struct xfs_mount *mp, 653 + struct xfbtree *xfbt, 654 + struct xfs_buftarg *btp, 655 + xfs_rgnumber_t rgno) 656 + { 657 + xfbt->owner = rgno; 658 + return xfbtree_init(mp, xfbt, btp, &xfs_rtrmapbt_mem_ops); 659 + } 660 + #endif /* CONFIG_XFS_BTREE_IN_MEM */ 661 + 662 + /* 663 + * Install a new rt reverse mapping btree root. Caller is responsible for 664 + * invalidating and freeing the old btree blocks. 665 + */ 666 + void 667 + xfs_rtrmapbt_commit_staged_btree( 668 + struct xfs_btree_cur *cur, 669 + struct xfs_trans *tp) 670 + { 671 + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; 672 + struct xfs_ifork *ifp; 673 + int flags = XFS_ILOG_CORE | XFS_ILOG_DBROOT; 674 + 675 + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); 676 + ASSERT(ifake->if_fork->if_format == XFS_DINODE_FMT_META_BTREE); 677 + 678 + /* 679 + * Free any resources hanging off the real fork, then shallow-copy the 680 + * staging fork's contents into the real fork to transfer everything 681 + * we just built. 
682 + */ 683 + ifp = xfs_ifork_ptr(cur->bc_ino.ip, XFS_DATA_FORK); 684 + xfs_idestroy_fork(ifp); 685 + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); 686 + 687 + cur->bc_ino.ip->i_projid = cur->bc_group->xg_gno; 688 + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); 689 + xfs_btree_commit_ifakeroot(cur, tp, XFS_DATA_FORK); 690 + } 691 + 692 + /* Calculate number of records in a rt reverse mapping btree block. */ 693 + static inline unsigned int 694 + xfs_rtrmapbt_block_maxrecs( 695 + unsigned int blocklen, 696 + bool leaf) 697 + { 698 + if (leaf) 699 + return blocklen / sizeof(struct xfs_rmap_rec); 700 + return blocklen / 701 + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rtrmap_ptr_t)); 702 + } 703 + 704 + /* 705 + * Calculate number of records in an rt reverse mapping btree block. 706 + */ 707 + unsigned int 708 + xfs_rtrmapbt_maxrecs( 709 + struct xfs_mount *mp, 710 + unsigned int blocklen, 711 + bool leaf) 712 + { 713 + blocklen -= XFS_RTRMAP_BLOCK_LEN; 714 + return xfs_rtrmapbt_block_maxrecs(blocklen, leaf); 715 + } 716 + 717 + /* Compute the max possible height for realtime reverse mapping btrees. */ 718 + unsigned int 719 + xfs_rtrmapbt_maxlevels_ondisk(void) 720 + { 721 + unsigned long long max_dblocks; 722 + unsigned int minrecs[2]; 723 + unsigned int blocklen; 724 + 725 + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; 726 + 727 + minrecs[0] = xfs_rtrmapbt_block_maxrecs(blocklen, true) / 2; 728 + minrecs[1] = xfs_rtrmapbt_block_maxrecs(blocklen, false) / 2; 729 + 730 + /* 731 + * Compute the asymptotic maxlevels for an rtrmapbt on any rtreflink fs. 732 + * 733 + * On a reflink filesystem, each block in an rtgroup can have up to 734 + * 2^32 (per the refcount record format) owners, which means that 735 + * theoretically we could face up to 2^64 rmap records. 
However, we're 736 + * likely to run out of blocks in the data device long before that 737 + * happens, which means that we must compute the max height based on 738 + * what the btree will look like if it consumes almost all the blocks 739 + * in the data device due to maximal sharing factor. 740 + */ 741 + max_dblocks = -1U; /* max ag count */ 742 + max_dblocks *= XFS_MAX_CRC_AG_BLOCKS; 743 + return xfs_btree_space_to_height(minrecs, max_dblocks); 744 + } 745 + 746 + int __init 747 + xfs_rtrmapbt_init_cur_cache(void) 748 + { 749 + xfs_rtrmapbt_cur_cache = kmem_cache_create("xfs_rtrmapbt_cur", 750 + xfs_btree_cur_sizeof(xfs_rtrmapbt_maxlevels_ondisk()), 751 + 0, 0, NULL); 752 + 753 + if (!xfs_rtrmapbt_cur_cache) 754 + return -ENOMEM; 755 + return 0; 756 + } 757 + 758 + void 759 + xfs_rtrmapbt_destroy_cur_cache(void) 760 + { 761 + kmem_cache_destroy(xfs_rtrmapbt_cur_cache); 762 + xfs_rtrmapbt_cur_cache = NULL; 763 + } 764 + 765 + /* Compute the maximum height of an rt reverse mapping btree. */ 766 + void 767 + xfs_rtrmapbt_compute_maxlevels( 768 + struct xfs_mount *mp) 769 + { 770 + unsigned int d_maxlevels, r_maxlevels; 771 + 772 + if (!xfs_has_rtrmapbt(mp)) { 773 + mp->m_rtrmap_maxlevels = 0; 774 + return; 775 + } 776 + 777 + /* 778 + * The realtime rmapbt lives on the data device, which means that its 779 + * maximum height is constrained by the size of the data device and 780 + * the height required to store one rmap record for each block in an 781 + * rt group. 782 + * 783 + * On a reflink filesystem, each rt block can have up to 2^32 (per the 784 + * refcount record format) owners, which means that theoretically we 785 + * could face up to 2^64 rmap records. This makes the computation of 786 + * maxlevels based on record count meaningless, so we only consider the 787 + * size of the data device. 
788 + */ 789 + d_maxlevels = xfs_btree_space_to_height(mp->m_rtrmap_mnr, 790 + mp->m_sb.sb_dblocks); 791 + if (xfs_has_rtreflink(mp)) { 792 + mp->m_rtrmap_maxlevels = d_maxlevels + 1; 793 + return; 794 + } 795 + 796 + r_maxlevels = xfs_btree_compute_maxlevels(mp->m_rtrmap_mnr, 797 + mp->m_groups[XG_TYPE_RTG].blocks); 798 + 799 + /* Add one level to handle the inode root level. */ 800 + mp->m_rtrmap_maxlevels = min(d_maxlevels, r_maxlevels) + 1; 801 + } 802 + 803 + /* Calculate the rtrmap btree size for some records. */ 804 + unsigned long long 805 + xfs_rtrmapbt_calc_size( 806 + struct xfs_mount *mp, 807 + unsigned long long len) 808 + { 809 + return xfs_btree_calc_size(mp->m_rtrmap_mnr, len); 810 + } 811 + 812 + /* 813 + * Calculate the maximum rmap btree size. 814 + */ 815 + static unsigned long long 816 + xfs_rtrmapbt_max_size( 817 + struct xfs_mount *mp, 818 + xfs_rtblock_t rtblocks) 819 + { 820 + /* Bail out if we're uninitialized, which can happen in mkfs. */ 821 + if (mp->m_rtrmap_mxr[0] == 0) 822 + return 0; 823 + 824 + return xfs_rtrmapbt_calc_size(mp, rtblocks); 825 + } 826 + 827 + /* 828 + * Figure out how many blocks to reserve and how many are used by this btree. 829 + */ 830 + xfs_filblks_t 831 + xfs_rtrmapbt_calc_reserves( 832 + struct xfs_mount *mp) 833 + { 834 + uint32_t blocks = mp->m_groups[XG_TYPE_RTG].blocks; 835 + 836 + if (!xfs_has_rtrmapbt(mp)) 837 + return 0; 838 + 839 + /* Reserve 1% of the rtgroup or enough for 1 block per record. */ 840 + return max_t(xfs_filblks_t, blocks / 100, 841 + xfs_rtrmapbt_max_size(mp, blocks)); 842 + } 843 + 844 + /* Convert on-disk form of btree root to in-memory form. 
*/ 845 + STATIC void 846 + xfs_rtrmapbt_from_disk( 847 + struct xfs_inode *ip, 848 + struct xfs_rtrmap_root *dblock, 849 + unsigned int dblocklen, 850 + struct xfs_btree_block *rblock) 851 + { 852 + struct xfs_mount *mp = ip->i_mount; 853 + struct xfs_rmap_key *fkp; 854 + __be64 *fpp; 855 + struct xfs_rmap_key *tkp; 856 + __be64 *tpp; 857 + struct xfs_rmap_rec *frp; 858 + struct xfs_rmap_rec *trp; 859 + unsigned int rblocklen = xfs_rtrmap_broot_space(mp, dblock); 860 + unsigned int numrecs; 861 + unsigned int maxrecs; 862 + 863 + xfs_btree_init_block(mp, rblock, &xfs_rtrmapbt_ops, 0, 0, ip->i_ino); 864 + 865 + rblock->bb_level = dblock->bb_level; 866 + rblock->bb_numrecs = dblock->bb_numrecs; 867 + numrecs = be16_to_cpu(dblock->bb_numrecs); 868 + 869 + if (be16_to_cpu(rblock->bb_level) > 0) { 870 + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); 871 + fkp = xfs_rtrmap_droot_key_addr(dblock, 1); 872 + tkp = xfs_rtrmap_key_addr(rblock, 1); 873 + fpp = xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); 874 + tpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); 875 + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); 876 + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); 877 + } else { 878 + frp = xfs_rtrmap_droot_rec_addr(dblock, 1); 879 + trp = xfs_rtrmap_rec_addr(rblock, 1); 880 + memcpy(trp, frp, sizeof(*frp) * numrecs); 881 + } 882 + } 883 + 884 + /* Load a realtime reverse mapping btree root in from disk. */ 885 + int 886 + xfs_iformat_rtrmap( 887 + struct xfs_inode *ip, 888 + struct xfs_dinode *dip) 889 + { 890 + struct xfs_mount *mp = ip->i_mount; 891 + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 892 + struct xfs_btree_block *broot; 893 + unsigned int numrecs; 894 + unsigned int level; 895 + int dsize; 896 + 897 + /* 898 + * growfs must create the rtrmap inodes before adding a realtime volume 899 + * to the filesystem, so we cannot use the rtrmapbt predicate here. 
900 + */ 901 + if (!xfs_has_rmapbt(ip->i_mount)) { 902 + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 903 + return -EFSCORRUPTED; 904 + } 905 + 906 + dsize = XFS_DFORK_SIZE(dip, mp, XFS_DATA_FORK); 907 + numrecs = be16_to_cpu(dfp->bb_numrecs); 908 + level = be16_to_cpu(dfp->bb_level); 909 + 910 + if (level > mp->m_rtrmap_maxlevels || 911 + xfs_rtrmap_droot_space_calc(level, numrecs) > dsize) { 912 + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 913 + return -EFSCORRUPTED; 914 + } 915 + 916 + broot = xfs_broot_alloc(xfs_ifork_ptr(ip, XFS_DATA_FORK), 917 + xfs_rtrmap_broot_space_calc(mp, level, numrecs)); 918 + if (broot) 919 + xfs_rtrmapbt_from_disk(ip, dfp, dsize, broot); 920 + return 0; 921 + } 922 + 923 + /* Convert in-memory form of btree root to on-disk form. */ 924 + void 925 + xfs_rtrmapbt_to_disk( 926 + struct xfs_mount *mp, 927 + struct xfs_btree_block *rblock, 928 + unsigned int rblocklen, 929 + struct xfs_rtrmap_root *dblock, 930 + unsigned int dblocklen) 931 + { 932 + struct xfs_rmap_key *fkp; 933 + __be64 *fpp; 934 + struct xfs_rmap_key *tkp; 935 + __be64 *tpp; 936 + struct xfs_rmap_rec *frp; 937 + struct xfs_rmap_rec *trp; 938 + unsigned int numrecs; 939 + unsigned int maxrecs; 940 + 941 + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_RTRMAP_CRC_MAGIC)); 942 + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); 943 + ASSERT(rblock->bb_u.l.bb_blkno == cpu_to_be64(XFS_BUF_DADDR_NULL)); 944 + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); 945 + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); 946 + 947 + dblock->bb_level = rblock->bb_level; 948 + dblock->bb_numrecs = rblock->bb_numrecs; 949 + numrecs = be16_to_cpu(rblock->bb_numrecs); 950 + 951 + if (be16_to_cpu(rblock->bb_level) > 0) { 952 + maxrecs = xfs_rtrmapbt_droot_maxrecs(dblocklen, false); 953 + fkp = xfs_rtrmap_key_addr(rblock, 1); 954 + tkp = xfs_rtrmap_droot_key_addr(dblock, 1); 955 + fpp = xfs_rtrmap_broot_ptr_addr(mp, rblock, 1, rblocklen); 956 + tpp 
= xfs_rtrmap_droot_ptr_addr(dblock, 1, maxrecs); 957 + memcpy(tkp, fkp, 2 * sizeof(*fkp) * numrecs); 958 + memcpy(tpp, fpp, sizeof(*fpp) * numrecs); 959 + } else { 960 + frp = xfs_rtrmap_rec_addr(rblock, 1); 961 + trp = xfs_rtrmap_droot_rec_addr(dblock, 1); 962 + memcpy(trp, frp, sizeof(*frp) * numrecs); 963 + } 964 + } 965 + 966 + /* Flush a realtime reverse mapping btree root out to disk. */ 967 + void 968 + xfs_iflush_rtrmap( 969 + struct xfs_inode *ip, 970 + struct xfs_dinode *dip) 971 + { 972 + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 973 + struct xfs_rtrmap_root *dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 974 + 975 + ASSERT(ifp->if_broot != NULL); 976 + ASSERT(ifp->if_broot_bytes > 0); 977 + ASSERT(xfs_rtrmap_droot_space(ifp->if_broot) <= 978 + xfs_inode_fork_size(ip, XFS_DATA_FORK)); 979 + xfs_rtrmapbt_to_disk(ip->i_mount, ifp->if_broot, ifp->if_broot_bytes, 980 + dfp, XFS_DFORK_SIZE(dip, ip->i_mount, XFS_DATA_FORK)); 981 + } 982 + 983 + /* 984 + * Create a realtime rmap btree inode. 985 + */ 986 + int 987 + xfs_rtrmapbt_create( 988 + struct xfs_rtgroup *rtg, 989 + struct xfs_inode *ip, 990 + struct xfs_trans *tp, 991 + bool init) 992 + { 993 + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 994 + struct xfs_mount *mp = ip->i_mount; 995 + struct xfs_btree_block *broot; 996 + 997 + ifp->if_format = XFS_DINODE_FMT_META_BTREE; 998 + ASSERT(ifp->if_broot_bytes == 0); 999 + ASSERT(ifp->if_bytes == 0); 1000 + 1001 + /* Initialize the empty incore btree root. */ 1002 + broot = xfs_broot_realloc(ifp, xfs_rtrmap_broot_space_calc(mp, 0, 0)); 1003 + if (broot) 1004 + xfs_btree_init_block(mp, broot, &xfs_rtrmapbt_ops, 0, 0, 1005 + ip->i_ino); 1006 + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DBROOT); 1007 + 1008 + return 0; 1009 + } 1010 + 1011 + /* 1012 + * Initialize an rmap for a realtime superblock using the potentially updated 1013 + * rt geometry in the provided @mp. 
1014 + */ 1015 + int 1016 + xfs_rtrmapbt_init_rtsb( 1017 + struct xfs_mount *mp, 1018 + struct xfs_rtgroup *rtg, 1019 + struct xfs_trans *tp) 1020 + { 1021 + struct xfs_rmap_irec rmap = { 1022 + .rm_blockcount = mp->m_sb.sb_rextsize, 1023 + .rm_owner = XFS_RMAP_OWN_FS, 1024 + }; 1025 + struct xfs_btree_cur *cur; 1026 + int error; 1027 + 1028 + ASSERT(xfs_has_rtsb(mp)); 1029 + ASSERT(rtg_rgno(rtg) == 0); 1030 + 1031 + cur = xfs_rtrmapbt_init_cursor(tp, rtg); 1032 + error = xfs_rmap_map_raw(cur, &rmap); 1033 + xfs_btree_del_cursor(cur, error); 1034 + return error; 1035 + }

+210

fs/xfs/libxfs/xfs_rtrmap_btree.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef __XFS_RTRMAP_BTREE_H__ 7 + #define __XFS_RTRMAP_BTREE_H__ 8 + 9 + struct xfs_buf; 10 + struct xfs_btree_cur; 11 + struct xfs_mount; 12 + struct xbtree_ifakeroot; 13 + struct xfs_rtgroup; 14 + struct xfbtree; 15 + 16 + /* rmaps only exist on crc enabled filesystems */ 17 + #define XFS_RTRMAP_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN 18 + 19 + struct xfs_btree_cur *xfs_rtrmapbt_init_cursor(struct xfs_trans *tp, 20 + struct xfs_rtgroup *rtg); 21 + struct xfs_btree_cur *xfs_rtrmapbt_stage_cursor(struct xfs_mount *mp, 22 + struct xfs_rtgroup *rtg, struct xfs_inode *ip, 23 + struct xbtree_ifakeroot *ifake); 24 + void xfs_rtrmapbt_commit_staged_btree(struct xfs_btree_cur *cur, 25 + struct xfs_trans *tp); 26 + unsigned int xfs_rtrmapbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, 27 + bool leaf); 28 + void xfs_rtrmapbt_compute_maxlevels(struct xfs_mount *mp); 29 + unsigned int xfs_rtrmapbt_droot_maxrecs(unsigned int blocklen, bool leaf); 30 + 31 + /* 32 + * Addresses of records, keys, and pointers within an incore rtrmapbt block. 
33 + * 34 + * (note that some of these may appear unused, but they are used in userspace) 35 + */ 36 + static inline struct xfs_rmap_rec * 37 + xfs_rtrmap_rec_addr( 38 + struct xfs_btree_block *block, 39 + unsigned int index) 40 + { 41 + return (struct xfs_rmap_rec *) 42 + ((char *)block + XFS_RTRMAP_BLOCK_LEN + 43 + (index - 1) * sizeof(struct xfs_rmap_rec)); 44 + } 45 + 46 + static inline struct xfs_rmap_key * 47 + xfs_rtrmap_key_addr( 48 + struct xfs_btree_block *block, 49 + unsigned int index) 50 + { 51 + return (struct xfs_rmap_key *) 52 + ((char *)block + XFS_RTRMAP_BLOCK_LEN + 53 + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); 54 + } 55 + 56 + static inline struct xfs_rmap_key * 57 + xfs_rtrmap_high_key_addr( 58 + struct xfs_btree_block *block, 59 + unsigned int index) 60 + { 61 + return (struct xfs_rmap_key *) 62 + ((char *)block + XFS_RTRMAP_BLOCK_LEN + 63 + sizeof(struct xfs_rmap_key) + 64 + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); 65 + } 66 + 67 + static inline xfs_rtrmap_ptr_t * 68 + xfs_rtrmap_ptr_addr( 69 + struct xfs_btree_block *block, 70 + unsigned int index, 71 + unsigned int maxrecs) 72 + { 73 + return (xfs_rtrmap_ptr_t *) 74 + ((char *)block + XFS_RTRMAP_BLOCK_LEN + 75 + maxrecs * 2 * sizeof(struct xfs_rmap_key) + 76 + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); 77 + } 78 + 79 + unsigned int xfs_rtrmapbt_maxlevels_ondisk(void); 80 + 81 + int __init xfs_rtrmapbt_init_cur_cache(void); 82 + void xfs_rtrmapbt_destroy_cur_cache(void); 83 + 84 + xfs_filblks_t xfs_rtrmapbt_calc_reserves(struct xfs_mount *mp); 85 + 86 + /* Addresses of key, pointers, and records within an ondisk rtrmapbt block. 
*/ 87 + 88 + static inline struct xfs_rmap_rec * 89 + xfs_rtrmap_droot_rec_addr( 90 + struct xfs_rtrmap_root *block, 91 + unsigned int index) 92 + { 93 + return (struct xfs_rmap_rec *) 94 + ((char *)(block + 1) + 95 + (index - 1) * sizeof(struct xfs_rmap_rec)); 96 + } 97 + 98 + static inline struct xfs_rmap_key * 99 + xfs_rtrmap_droot_key_addr( 100 + struct xfs_rtrmap_root *block, 101 + unsigned int index) 102 + { 103 + return (struct xfs_rmap_key *) 104 + ((char *)(block + 1) + 105 + (index - 1) * 2 * sizeof(struct xfs_rmap_key)); 106 + } 107 + 108 + static inline xfs_rtrmap_ptr_t * 109 + xfs_rtrmap_droot_ptr_addr( 110 + struct xfs_rtrmap_root *block, 111 + unsigned int index, 112 + unsigned int maxrecs) 113 + { 114 + return (xfs_rtrmap_ptr_t *) 115 + ((char *)(block + 1) + 116 + maxrecs * 2 * sizeof(struct xfs_rmap_key) + 117 + (index - 1) * sizeof(xfs_rtrmap_ptr_t)); 118 + } 119 + 120 + /* 121 + * Address of pointers within the incore btree root. 122 + * 123 + * These are to be used when we know the size of the block and 124 + * we don't have a cursor. 125 + */ 126 + static inline xfs_rtrmap_ptr_t * 127 + xfs_rtrmap_broot_ptr_addr( 128 + struct xfs_mount *mp, 129 + struct xfs_btree_block *bb, 130 + unsigned int index, 131 + unsigned int block_size) 132 + { 133 + return xfs_rtrmap_ptr_addr(bb, index, 134 + xfs_rtrmapbt_maxrecs(mp, block_size, false)); 135 + } 136 + 137 + /* 138 + * Compute the space required for the incore btree root containing the given 139 + * number of records. 
140 + */ 141 + static inline size_t 142 + xfs_rtrmap_broot_space_calc( 143 + struct xfs_mount *mp, 144 + unsigned int level, 145 + unsigned int nrecs) 146 + { 147 + size_t sz = XFS_RTRMAP_BLOCK_LEN; 148 + 149 + if (level > 0) 150 + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + 151 + sizeof(xfs_rtrmap_ptr_t)); 152 + return sz + nrecs * sizeof(struct xfs_rmap_rec); 153 + } 154 + 155 + /* 156 + * Compute the space required for the incore btree root given the ondisk 157 + * btree root block. 158 + */ 159 + static inline size_t 160 + xfs_rtrmap_broot_space(struct xfs_mount *mp, struct xfs_rtrmap_root *bb) 161 + { 162 + return xfs_rtrmap_broot_space_calc(mp, be16_to_cpu(bb->bb_level), 163 + be16_to_cpu(bb->bb_numrecs)); 164 + } 165 + 166 + /* Compute the space required for the ondisk root block. */ 167 + static inline size_t 168 + xfs_rtrmap_droot_space_calc( 169 + unsigned int level, 170 + unsigned int nrecs) 171 + { 172 + size_t sz = sizeof(struct xfs_rtrmap_root); 173 + 174 + if (level > 0) 175 + return sz + nrecs * (2 * sizeof(struct xfs_rmap_key) + 176 + sizeof(xfs_rtrmap_ptr_t)); 177 + return sz + nrecs * sizeof(struct xfs_rmap_rec); 178 + } 179 + 180 + /* 181 + * Compute the space required for the ondisk root block given an incore root 182 + * block. 
183 + */ 184 + static inline size_t 185 + xfs_rtrmap_droot_space(struct xfs_btree_block *bb) 186 + { 187 + return xfs_rtrmap_droot_space_calc(be16_to_cpu(bb->bb_level), 188 + be16_to_cpu(bb->bb_numrecs)); 189 + } 190 + 191 + int xfs_iformat_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); 192 + void xfs_rtrmapbt_to_disk(struct xfs_mount *mp, struct xfs_btree_block *rblock, 193 + unsigned int rblocklen, struct xfs_rtrmap_root *dblock, 194 + unsigned int dblocklen); 195 + void xfs_iflush_rtrmap(struct xfs_inode *ip, struct xfs_dinode *dip); 196 + 197 + int xfs_rtrmapbt_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, 198 + struct xfs_trans *tp, bool init); 199 + int xfs_rtrmapbt_init_rtsb(struct xfs_mount *mp, struct xfs_rtgroup *rtg, 200 + struct xfs_trans *tp); 201 + 202 + unsigned long long xfs_rtrmapbt_calc_size(struct xfs_mount *mp, 203 + unsigned long long len); 204 + 205 + struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, 206 + struct xfs_trans *tp, struct xfbtree *xfbtree); 207 + int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, 208 + struct xfs_buftarg *btp, xfs_rgnumber_t rgno); 209 + 210 + #endif /* __XFS_RTRMAP_BTREE_H__ */

+14

fs/xfs/libxfs/xfs_sb.c

··· 28 28 #include "xfs_rtbitmap.h" 29 29 #include "xfs_exchrange.h" 30 30 #include "xfs_rtgroup.h" 31 + #include "xfs_rtrmap_btree.h" 32 + #include "xfs_rtrefcount_btree.h" 31 33 32 34 /* 33 35 * Physical superblock buffer manipulations. Shared with libxfs in userspace. ··· 1217 1215 mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; 1218 1216 mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; 1219 1217 1218 + mp->m_rtrmap_mxr[0] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, true); 1219 + mp->m_rtrmap_mxr[1] = xfs_rtrmapbt_maxrecs(mp, sbp->sb_blocksize, false); 1220 + mp->m_rtrmap_mnr[0] = mp->m_rtrmap_mxr[0] / 2; 1221 + mp->m_rtrmap_mnr[1] = mp->m_rtrmap_mxr[1] / 2; 1222 + 1220 1223 mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, true); 1221 1224 mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, false); 1222 1225 mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; 1223 1226 mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; 1227 + 1228 + mp->m_rtrefc_mxr[0] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, 1229 + true); 1230 + mp->m_rtrefc_mxr[1] = xfs_rtrefcountbt_maxrecs(mp, sbp->sb_blocksize, 1231 + false); 1232 + mp->m_rtrefc_mnr[0] = mp->m_rtrefc_mxr[0] / 2; 1233 + mp->m_rtrefc_mnr[1] = mp->m_rtrefc_mxr[1] / 2; 1224 1234 1225 1235 mp->m_bsize = XFS_FSB_TO_BB(mp, 1); 1226 1236 mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);

+21

fs/xfs/libxfs/xfs_shared.h

··· 42 42 extern const struct xfs_buf_ops xfs_rtsummary_buf_ops; 43 43 extern const struct xfs_buf_ops xfs_rtbuf_ops; 44 44 extern const struct xfs_buf_ops xfs_rtsb_buf_ops; 45 + extern const struct xfs_buf_ops xfs_rtrefcountbt_buf_ops; 46 + extern const struct xfs_buf_ops xfs_rtrmapbt_buf_ops; 45 47 extern const struct xfs_buf_ops xfs_sb_buf_ops; 46 48 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; 47 49 extern const struct xfs_buf_ops xfs_symlink_buf_ops; ··· 57 55 extern const struct xfs_btree_ops xfs_refcountbt_ops; 58 56 extern const struct xfs_btree_ops xfs_rmapbt_ops; 59 57 extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; 58 + extern const struct xfs_btree_ops xfs_rtrmapbt_ops; 59 + extern const struct xfs_btree_ops xfs_rtrmapbt_mem_ops; 60 + extern const struct xfs_btree_ops xfs_rtrefcountbt_ops; 60 61 61 62 static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) 62 63 { ··· 101 96 { 102 97 return ops == &xfs_rmapbt_mem_ops; 103 98 } 99 + 100 + static inline bool xfs_btree_is_mem_rtrmap(const struct xfs_btree_ops *ops) 101 + { 102 + return ops == &xfs_rtrmapbt_mem_ops; 103 + } 104 104 #else 105 105 # define xfs_btree_is_mem_rmap(...) (false) 106 + # define xfs_btree_is_mem_rtrmap(...) (false) 106 107 #endif 108 + 109 + static inline bool xfs_btree_is_rtrmap(const struct xfs_btree_ops *ops) 110 + { 111 + return ops == &xfs_rtrmapbt_ops; 112 + } 113 + 114 + static inline bool xfs_btree_is_rtrefcount(const struct xfs_btree_ops *ops) 115 + { 116 + return ops == &xfs_rtrefcountbt_ops; 117 + } 107 118 108 119 /* log size calculation functions */ 109 120 int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);

+32 -5

fs/xfs/libxfs/xfs_trans_resv.c

··· 92 92 return num_ops * (2 * mp->m_refc_maxlevels - 1); 93 93 } 94 94 95 + static unsigned int 96 + xfs_rtrefcountbt_block_count( 97 + struct xfs_mount *mp, 98 + unsigned int num_ops) 99 + { 100 + return num_ops * (2 * mp->m_rtrefc_maxlevels - 1); 101 + } 102 + 95 103 /* 96 104 * Logging inodes is really tricksy. They are logged in memory format, 97 105 * which means that what we write into the log doesn't directly translate into ··· 221 213 * Per-extent log reservation for the btree changes involved in freeing or 222 214 * allocating a realtime extent. We have to be able to log as many rtbitmap 223 215 * blocks as needed to mark inuse XFS_BMBT_MAX_EXTLEN blocks' worth of realtime 224 - * extents, as well as the realtime summary block. 216 + * extents, as well as the realtime summary block (t1). Realtime rmap btree 217 + * operations happen in a second transaction, so factor in a couple of rtrmapbt 218 + * splits (t2). 225 219 */ 226 220 static unsigned int 227 221 xfs_rtalloc_block_count( ··· 232 222 { 233 223 unsigned int rtbmp_blocks; 234 224 xfs_rtxlen_t rtxlen; 225 + unsigned int t1, t2 = 0; 235 226 236 227 rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); 237 228 rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen); 238 - return (rtbmp_blocks + 1) * num_ops; 229 + t1 = (rtbmp_blocks + 1) * num_ops; 230 + 231 + if (xfs_has_rmapbt(mp)) 232 + t2 = num_ops * (2 * mp->m_rtrmap_maxlevels - 1); 233 + 234 + return max(t1, t2); 239 235 } 240 236 241 237 /* ··· 267 251 * Compute the log reservation required to handle the refcount update 268 252 * transaction. Refcount updates are always done via deferred log items. 
269 253 * 270 - * This is calculated as: 254 + * This is calculated as the max of: 271 255 * Data device refcount updates (t1): 272 256 * the agfs of the ags containing the blocks: nr_ops * sector size 273 257 * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size 258 + * Realtime refcount updates (t2); 259 + * the rt refcount inode 260 + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size 274 261 */ 275 262 static unsigned int 276 263 xfs_calc_refcountbt_reservation( ··· 281 262 unsigned int nr_ops) 282 263 { 283 264 unsigned int blksz = XFS_FSB_TO_B(mp, 1); 265 + unsigned int t1, t2 = 0; 284 266 285 267 if (!xfs_has_reflink(mp)) 286 268 return 0; 287 269 288 - return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + 289 - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); 270 + t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + 271 + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); 272 + 273 + if (xfs_has_realtime(mp)) 274 + t2 = xfs_calc_inode_res(mp, 1) + 275 + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), 276 + blksz); 277 + 278 + return max(t1, t2); 290 279 } 291 280 292 281 /*

+13

fs/xfs/libxfs/xfs_trans_space.h

··· 14 14 #define XFS_MAX_CONTIG_BMAPS_PER_BLOCK(mp) \ 15 15 (((mp)->m_bmap_dmxr[0]) - ((mp)->m_bmap_dmnr[0])) 16 16 17 + /* Worst case number of realtime rmaps that can be held in a block. */ 18 + #define XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) \ 19 + (((mp)->m_rtrmap_mxr[0]) - ((mp)->m_rtrmap_mnr[0])) 20 + 21 + /* Adding one realtime rmap could split every level to the top of the tree. */ 22 + #define XFS_RTRMAPADD_SPACE_RES(mp) ((mp)->m_rtrmap_maxlevels) 23 + 24 + /* Blocks we might need to add "b" realtime rmaps to a tree. */ 25 + #define XFS_NRTRMAPADD_SPACE_RES(mp, b) \ 26 + ((((b) + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp) - 1) / \ 27 + XFS_MAX_CONTIG_RTRMAPS_PER_BLOCK(mp)) * \ 28 + XFS_RTRMAPADD_SPACE_RES(mp)) 29 + 17 30 /* Worst case number of rmaps that can be held in a block. */ 18 31 #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ 19 32 (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))

+7

fs/xfs/libxfs/xfs_types.h

··· 202 202 * altering fdblocks. If you think you need this you're wrong. 203 203 */ 204 204 XFS_AG_RESV_IGNORE, 205 + 206 + /* 207 + * This allocation activity is being done on behalf of a metadata file. 208 + * These files maintain their own permanent space reservations and are 209 + * required to adjust fdblocks using the xfs_metafile_resv_* helpers. 210 + */ 211 + XFS_AG_RESV_METAFILE, 205 212 }; 206 213 207 214 /* Results of scanning a btree keyspace to check occupancy. */

+1 -1

fs/xfs/scrub/agheader_repair.c

··· 647 647 xfs_agblock_t agbno = start; 648 648 int error; 649 649 650 - trace_xrep_agfl_insert(sc->sa.pag, agbno, len); 650 + trace_xrep_agfl_insert(pag_group(sc->sa.pag), agbno, len); 651 651 652 652 while (agbno < start + len && af->fl_off < af->flcount) 653 653 af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);

+3 -2

fs/xfs/scrub/alloc_repair.c

··· 542 542 543 543 /* Add a deferred rmap for each extent we used. */ 544 544 if (resv->used > 0) 545 - xfs_rmap_alloc_extent(sc->tp, pag_agno(pag), resv->agbno, 546 - resv->used, XFS_RMAP_OWN_AG); 545 + xfs_rmap_alloc_extent(sc->tp, false, 546 + xfs_agbno_to_fsb(pag, resv->agbno), resv->used, 547 + XFS_RMAP_OWN_AG); 547 548 548 549 /* 549 550 * For each reserved btree block we didn't use, add it to the free

+104 -22

fs/xfs/scrub/bmap.c

··· 21 21 #include "xfs_rmap_btree.h" 22 22 #include "xfs_rtgroup.h" 23 23 #include "xfs_health.h" 24 + #include "xfs_rtalloc.h" 25 + #include "xfs_rtrmap_btree.h" 24 26 #include "scrub/scrub.h" 25 27 #include "scrub/common.h" 26 28 #include "scrub/btree.h" ··· 145 143 xchk_bmap_get_rmap( 146 144 struct xchk_bmap_info *info, 147 145 struct xfs_bmbt_irec *irec, 148 - xfs_agblock_t agbno, 146 + xfs_agblock_t bno, 149 147 uint64_t owner, 150 148 struct xfs_rmap_irec *rmap) 151 149 { 150 + struct xfs_btree_cur **curp = &info->sc->sa.rmap_cur; 152 151 xfs_fileoff_t offset; 153 152 unsigned int rflags = 0; 154 153 int has_rmap; 155 154 int error; 155 + 156 + if (xfs_ifork_is_realtime(info->sc->ip, info->whichfork)) 157 + curp = &info->sc->sr.rmap_cur; 158 + 159 + if (*curp == NULL) 160 + return false; 156 161 157 162 if (info->whichfork == XFS_ATTR_FORK) 158 163 rflags |= XFS_RMAP_ATTR_FORK; ··· 181 172 * range rmap lookup to make sure we get the correct owner/offset. 182 173 */ 183 174 if (info->is_shared) { 184 - error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno, 185 - owner, offset, rflags, rmap, &has_rmap); 175 + error = xfs_rmap_lookup_le_range(*curp, bno, owner, offset, 176 + rflags, rmap, &has_rmap); 186 177 } else { 187 - error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 188 - owner, offset, rflags, rmap, &has_rmap); 178 + error = xfs_rmap_lookup_le(*curp, bno, owner, offset, 179 + rflags, rmap, &has_rmap); 189 180 } 190 - if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur)) 181 + if (!xchk_should_check_xref(info->sc, &error, curp)) 191 182 return false; 192 183 193 184 if (!has_rmap) ··· 201 192 xchk_bmap_xref_rmap( 202 193 struct xchk_bmap_info *info, 203 194 struct xfs_bmbt_irec *irec, 204 - xfs_agblock_t agbno) 195 + xfs_agblock_t bno) 205 196 { 206 197 struct xfs_rmap_irec rmap; 207 198 unsigned long long rmap_end; 208 199 uint64_t owner = info->sc->ip->i_ino; 209 200 210 - if (!info->sc->sa.rmap_cur || 
xchk_skip_xref(info->sc->sm)) 201 + if (xchk_skip_xref(info->sc->sm)) 211 202 return; 212 203 213 204 /* Find the rmap record for this irec. */ 214 - if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) 205 + if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap)) 215 206 return; 216 207 217 208 /* 218 209 * The rmap must be an exact match for this incore file mapping record, 219 210 * which may have arisen from multiple ondisk records. 220 211 */ 221 - if (rmap.rm_startblock != agbno) 212 + if (rmap.rm_startblock != bno) 222 213 xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, 223 214 irec->br_startoff); 224 215 225 216 rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; 226 - if (rmap_end != agbno + irec->br_blockcount) 217 + if (rmap_end != bno + irec->br_blockcount) 227 218 xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, 228 219 irec->br_startoff); 229 220 ··· 268 259 xchk_bmap_xref_rmap_cow( 269 260 struct xchk_bmap_info *info, 270 261 struct xfs_bmbt_irec *irec, 271 - xfs_agblock_t agbno) 262 + xfs_agblock_t bno) 272 263 { 273 264 struct xfs_rmap_irec rmap; 274 265 unsigned long long rmap_end; ··· 278 269 return; 279 270 280 271 /* Find the rmap record for this irec. */ 281 - if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) 272 + if (!xchk_bmap_get_rmap(info, irec, bno, owner, &rmap)) 282 273 return; 283 274 284 275 /* ··· 286 277 * can start before and end after the physical space allocated to this 287 278 * mapping. There are no offsets to check. 
288 279 */ 289 - if (rmap.rm_startblock > agbno) 280 + if (rmap.rm_startblock > bno) 290 281 xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, 291 282 irec->br_startoff); 292 283 293 284 rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; 294 - if (rmap_end < agbno + irec->br_blockcount) 285 + if (rmap_end < bno + irec->br_blockcount) 295 286 xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, 296 287 irec->br_startoff); 297 288 ··· 324 315 struct xchk_bmap_info *info, 325 316 struct xfs_bmbt_irec *irec) 326 317 { 318 + struct xfs_owner_info oinfo; 319 + xfs_rgblock_t rgbno; 327 320 int error; 328 321 329 322 error = xchk_rtgroup_init_existing(info->sc, ··· 335 324 irec->br_startoff, &error)) 336 325 return; 337 326 338 - xchk_rtgroup_lock(&info->sc->sr, XCHK_RTGLOCK_ALL); 327 + error = xchk_rtgroup_lock(info->sc, &info->sc->sr, XCHK_RTGLOCK_ALL); 328 + if (!xchk_fblock_process_error(info->sc, info->whichfork, 329 + irec->br_startoff, &error)) 330 + goto out_free; 331 + 339 332 xchk_xref_is_used_rt_space(info->sc, irec->br_startblock, 340 333 irec->br_blockcount); 341 334 335 + if (!xfs_has_rtrmapbt(info->sc->mp)) 336 + goto out_cur; 337 + 338 + rgbno = xfs_rtb_to_rgbno(info->sc->mp, irec->br_startblock); 339 + 340 + switch (info->whichfork) { 341 + case XFS_DATA_FORK: 342 + xchk_bmap_xref_rmap(info, irec, rgbno); 343 + if (!xfs_is_reflink_inode(info->sc->ip)) { 344 + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, 345 + info->whichfork, irec->br_startoff); 346 + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, 347 + irec->br_blockcount, &oinfo); 348 + xchk_xref_is_not_rt_shared(info->sc, rgbno, 349 + irec->br_blockcount); 350 + } 351 + xchk_xref_is_not_rt_cow_staging(info->sc, rgbno, 352 + irec->br_blockcount); 353 + break; 354 + case XFS_COW_FORK: 355 + xchk_bmap_xref_rmap_cow(info, irec, rgbno); 356 + xchk_xref_is_only_rt_owned_by(info->sc, rgbno, 357 + irec->br_blockcount, &XFS_RMAP_OINFO_COW); 358 + 
xchk_xref_is_rt_cow_staging(info->sc, rgbno, 359 + irec->br_blockcount); 360 + xchk_xref_is_not_rt_shared(info->sc, rgbno, 361 + irec->br_blockcount); 362 + break; 363 + } 364 + out_cur: 365 + xchk_rtgroup_btcur_free(&info->sc->sr); 366 + out_free: 342 367 xchk_rtgroup_free(info->sc, &info->sc->sr); 343 368 } 344 369 ··· 661 614 xchk_fblock_set_corrupt(sc, sbcri->whichfork, 662 615 check_rec.rm_offset); 663 616 if (irec.br_startblock != 664 - xfs_agbno_to_fsb(to_perag(cur->bc_group), 665 - check_rec.rm_startblock)) 617 + xfs_gbno_to_fsb(cur->bc_group, check_rec.rm_startblock)) 666 618 xchk_fblock_set_corrupt(sc, sbcri->whichfork, 667 619 check_rec.rm_offset); 668 620 if (irec.br_blockcount > check_rec.rm_blockcount) ··· 712 666 713 667 xfs_btree_del_cursor(cur, error); 714 668 xfs_trans_brelse(sc->tp, agf); 669 + return error; 670 + } 671 + 672 + /* Make sure each rt rmap has a corresponding bmbt entry. */ 673 + STATIC int 674 + xchk_bmap_check_rt_rmaps( 675 + struct xfs_scrub *sc, 676 + struct xfs_rtgroup *rtg) 677 + { 678 + struct xchk_bmap_check_rmap_info sbcri; 679 + struct xfs_btree_cur *cur; 680 + int error; 681 + 682 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 683 + cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); 684 + 685 + sbcri.sc = sc; 686 + sbcri.whichfork = XFS_DATA_FORK; 687 + error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri); 688 + if (error == -ECANCELED) 689 + error = 0; 690 + 691 + xfs_btree_del_cursor(cur, error); 692 + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 715 693 return error; 716 694 } 717 695 ··· 793 723 { 794 724 struct xfs_ifork *ifp = &ip->i_df; 795 725 796 - /* Don't support realtime rmap checks yet. 
*/ 797 - if (XFS_IS_REALTIME_INODE(ip)) 798 - return false; 799 - 800 726 /* 801 727 * If the dinode repair found a bad data fork, it will reset the fork 802 728 * to extents format with zero records and wait for the this scrubber ··· 842 776 { 843 777 struct xfs_perag *pag = NULL; 844 778 int error; 779 + 780 + if (xfs_ifork_is_realtime(sc->ip, whichfork)) { 781 + struct xfs_rtgroup *rtg = NULL; 782 + 783 + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { 784 + error = xchk_bmap_check_rt_rmaps(sc, rtg); 785 + if (error || 786 + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { 787 + xfs_rtgroup_rele(rtg); 788 + return error; 789 + } 790 + } 791 + 792 + return 0; 793 + } 845 794 846 795 while ((pag = xfs_perag_next(sc->mp, pag))) { 847 796 error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); ··· 1064 983 case XFS_DINODE_FMT_UUID: 1065 984 case XFS_DINODE_FMT_DEV: 1066 985 case XFS_DINODE_FMT_LOCAL: 986 + case XFS_DINODE_FMT_META_BTREE: 1067 987 /* No mappings to check. */ 1068 988 if (whichfork == XFS_COW_FORK) 1069 989 xchk_fblock_set_corrupt(sc, whichfork, 0);

+137 -11

fs/xfs/scrub/bmap_repair.c

··· 25 25 #include "xfs_bmap_btree.h" 26 26 #include "xfs_rmap.h" 27 27 #include "xfs_rmap_btree.h" 28 + #include "xfs_rtrmap_btree.h" 28 29 #include "xfs_refcount.h" 29 30 #include "xfs_quota.h" 30 31 #include "xfs_ialloc.h" 31 32 #include "xfs_ag.h" 32 33 #include "xfs_reflink.h" 34 + #include "xfs_rtgroup.h" 33 35 #include "scrub/xfs_scrub.h" 34 36 #include "scrub/scrub.h" 35 37 #include "scrub/common.h" ··· 101 99 xfs_filblks_t blockcount) 102 100 { 103 101 struct xfs_scrub *sc = rb->sc; 102 + struct xfs_btree_cur *cur; 104 103 xfs_agblock_t agbno; 105 104 xfs_agblock_t fbno; 106 105 xfs_extlen_t flen; 107 106 int error; 108 107 109 - agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); 110 - error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, 111 - &fbno, &flen, false); 108 + if (XFS_IS_REALTIME_INODE(sc->ip)) { 109 + agbno = xfs_rtb_to_rgbno(sc->mp, startblock); 110 + cur = sc->sr.refc_cur; 111 + } else { 112 + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); 113 + cur = sc->sa.refc_cur; 114 + } 115 + error = xfs_refcount_find_shared(cur, agbno, blockcount, &fbno, &flen, 116 + false); 112 117 if (error) 113 118 return error; 114 119 ··· 368 359 return error; 369 360 } 370 361 362 + #ifdef CONFIG_XFS_RT 363 + /* Check for any obvious errors or conflicts in the file mapping. */ 364 + STATIC int 365 + xrep_bmap_check_rtfork_rmap( 366 + struct xfs_scrub *sc, 367 + struct xfs_btree_cur *cur, 368 + const struct xfs_rmap_irec *rec) 369 + { 370 + /* xattr extents are never stored on realtime devices */ 371 + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) 372 + return -EFSCORRUPTED; 373 + 374 + /* bmbt blocks are never stored on realtime devices */ 375 + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) 376 + return -EFSCORRUPTED; 377 + 378 + /* Data extents for non-rt files are never stored on the rt device. */ 379 + if (!XFS_IS_REALTIME_INODE(sc->ip)) 380 + return -EFSCORRUPTED; 381 + 382 + /* Check the file offsets and physical extents. 
*/ 383 + if (!xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) 384 + return -EFSCORRUPTED; 385 + 386 + /* Check that this is within the rtgroup. */ 387 + if (!xfs_verify_rgbext(to_rtg(cur->bc_group), rec->rm_startblock, 388 + rec->rm_blockcount)) 389 + return -EFSCORRUPTED; 390 + 391 + /* Make sure this isn't free space. */ 392 + return xrep_require_rtext_inuse(sc, rec->rm_startblock, 393 + rec->rm_blockcount); 394 + } 395 + 396 + /* Record realtime extents that belong to this inode's fork. */ 397 + STATIC int 398 + xrep_bmap_walk_rtrmap( 399 + struct xfs_btree_cur *cur, 400 + const struct xfs_rmap_irec *rec, 401 + void *priv) 402 + { 403 + struct xrep_bmap *rb = priv; 404 + int error = 0; 405 + 406 + if (xchk_should_terminate(rb->sc, &error)) 407 + return error; 408 + 409 + /* Skip extents which are not owned by this inode and fork. */ 410 + if (rec->rm_owner != rb->sc->ip->i_ino) 411 + return 0; 412 + 413 + error = xrep_bmap_check_rtfork_rmap(rb->sc, cur, rec); 414 + if (error) 415 + return error; 416 + 417 + /* 418 + * Record all blocks allocated to this file even if the extent isn't 419 + * for the fork we're rebuilding so that we can reset di_nblocks later. 420 + */ 421 + rb->nblocks += rec->rm_blockcount; 422 + 423 + /* If this rmap isn't for the fork we want, we're done. */ 424 + if (rb->whichfork == XFS_DATA_FORK && 425 + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) 426 + return 0; 427 + if (rb->whichfork == XFS_ATTR_FORK && 428 + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) 429 + return 0; 430 + 431 + return xrep_bmap_from_rmap(rb, rec->rm_offset, 432 + xfs_rgbno_to_rtb(to_rtg(cur->bc_group), 433 + rec->rm_startblock), 434 + rec->rm_blockcount, 435 + rec->rm_flags & XFS_RMAP_UNWRITTEN); 436 + } 437 + 438 + /* Scan the realtime reverse mappings to build the new extent map. 
*/ 439 + STATIC int 440 + xrep_bmap_scan_rtgroup( 441 + struct xrep_bmap *rb, 442 + struct xfs_rtgroup *rtg) 443 + { 444 + struct xfs_scrub *sc = rb->sc; 445 + int error; 446 + 447 + if (!xfs_has_rtrmapbt(sc->mp)) 448 + return 0; 449 + 450 + error = xrep_rtgroup_init(sc, rtg, &sc->sr, 451 + XFS_RTGLOCK_RMAP | 452 + XFS_RTGLOCK_REFCOUNT | 453 + XFS_RTGLOCK_BITMAP_SHARED); 454 + if (error) 455 + return error; 456 + 457 + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_bmap_walk_rtrmap, rb); 458 + xchk_rtgroup_btcur_free(&sc->sr); 459 + xchk_rtgroup_free(sc, &sc->sr); 460 + return error; 461 + } 462 + #else 463 + static inline int 464 + xrep_bmap_scan_rtgroup(struct xrep_bmap *rb, struct xfs_rtgroup *rtg) 465 + { 466 + return -EFSCORRUPTED; 467 + } 468 + #endif 469 + 371 470 /* Find the delalloc extents from the old incore extent tree. */ 372 471 STATIC int 373 472 xrep_bmap_find_delalloc( ··· 526 409 struct xfs_scrub *sc = rb->sc; 527 410 struct xfs_perag *pag = NULL; 528 411 int error = 0; 412 + 413 + /* 414 + * Iterate the rtrmaps for extents. Metadata files never have content 415 + * on the realtime device, so there's no need to scan them. 416 + */ 417 + if (!xfs_is_metadir_inode(sc->ip)) { 418 + struct xfs_rtgroup *rtg = NULL; 419 + 420 + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { 421 + error = xrep_bmap_scan_rtgroup(rb, rtg); 422 + if (error) { 423 + xfs_rtgroup_rele(rtg); 424 + return error; 425 + } 426 + } 427 + } 529 428 530 429 /* Iterate the rmaps for extents. */ 531 430 while ((pag = xfs_perag_next(sc->mp, pag))) { ··· 864 731 case XFS_DINODE_FMT_DEV: 865 732 case XFS_DINODE_FMT_LOCAL: 866 733 case XFS_DINODE_FMT_UUID: 734 + case XFS_DINODE_FMT_META_BTREE: 867 735 return -ECANCELED; 868 736 case XFS_DINODE_FMT_EXTENTS: 869 737 case XFS_DINODE_FMT_BTREE: ··· 886 752 default: 887 753 return -EINVAL; 888 754 } 889 - 890 - /* Don't know how to rebuild realtime data forks. 
*/ 891 - if (XFS_IS_REALTIME_INODE(sc->ip)) 892 - return -EOPNOTSUPP; 893 755 894 756 return 0; 895 757 } ··· 910 780 911 781 /* cannot share attr fork extents */ 912 782 if (whichfork != XFS_DATA_FORK) 913 - return RLS_IRRELEVANT; 914 - 915 - /* cannot share realtime extents */ 916 - if (XFS_IS_REALTIME_INODE(sc->ip)) 917 783 return RLS_IRRELEVANT; 918 784 919 785 return RLS_UNKNOWN;

+166 -4

fs/xfs/scrub/common.c

··· 35 35 #include "xfs_exchmaps.h" 36 36 #include "xfs_rtbitmap.h" 37 37 #include "xfs_rtgroup.h" 38 + #include "xfs_rtrmap_btree.h" 39 + #include "xfs_bmap_util.h" 40 + #include "xfs_rtrefcount_btree.h" 38 41 #include "scrub/scrub.h" 39 42 #include "scrub/common.h" 40 43 #include "scrub/trace.h" ··· 722 719 return 0; 723 720 } 724 721 725 - void 722 + /* Lock all the rt group metadata inode ILOCKs and wait for intents. */ 723 + int 726 724 xchk_rtgroup_lock( 725 + struct xfs_scrub *sc, 727 726 struct xchk_rt *sr, 728 727 unsigned int rtglock_flags) 729 728 { 730 - xfs_rtgroup_lock(sr->rtg, rtglock_flags); 729 + int error = 0; 730 + 731 + ASSERT(sr->rtg != NULL); 732 + 733 + /* 734 + * If we're /only/ locking the rtbitmap in shared mode, then we're 735 + * obviously not trying to compare records in two metadata inodes. 736 + * There's no need to drain intents here because the caller (most 737 + * likely the rgsuper scanner) doesn't need that level of consistency. 738 + */ 739 + if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) { 740 + xfs_rtgroup_lock(sr->rtg, rtglock_flags); 741 + sr->rtlock_flags = rtglock_flags; 742 + return 0; 743 + } 744 + 745 + do { 746 + if (xchk_should_terminate(sc, &error)) 747 + return error; 748 + 749 + xfs_rtgroup_lock(sr->rtg, rtglock_flags); 750 + 751 + /* 752 + * If we've grabbed a non-metadata file for scrubbing, we 753 + * assume that holding its ILOCK will suffice to coordinate 754 + * with any rt intent chains involving this inode. 755 + */ 756 + if (sc->ip && !xfs_is_internal_inode(sc->ip)) 757 + break; 758 + 759 + /* 760 + * Decide if the rt group is quiet enough for all metadata to 761 + * be consistent with each other. Regular file IO doesn't get 762 + * to lock all the rt inodes at the same time, which means that 763 + * there could be other threads in the middle of processing a 764 + * chain of deferred ops. 
765 + * 766 + * We just locked all the metadata inodes for this rt group; 767 + * now take a look to see if there are any intents in progress. 768 + * If there are, drop the rt group inode locks and wait for the 769 + * intents to drain. Since we hold the rt group inode locks 770 + * for the duration of the scrub, this is the only time we have 771 + * to sample the intents counter; any threads increasing it 772 + * after this point can't possibly be in the middle of a chain 773 + * of rt metadata updates. 774 + * 775 + * Obviously, this should be slanted against scrub and in favor 776 + * of runtime threads. 777 + */ 778 + if (!xfs_group_intent_busy(rtg_group(sr->rtg))) 779 + break; 780 + 781 + xfs_rtgroup_unlock(sr->rtg, rtglock_flags); 782 + 783 + if (!(sc->flags & XCHK_FSGATES_DRAIN)) 784 + return -ECHRNG; 785 + error = xfs_group_intent_drain(rtg_group(sr->rtg)); 786 + if (error) { 787 + if (error == -ERESTARTSYS) 788 + error = -EINTR; 789 + return error; 790 + } 791 + } while (1); 792 + 731 793 sr->rtlock_flags = rtglock_flags; 794 + 795 + if (xfs_has_rtrmapbt(sc->mp) && (rtglock_flags & XFS_RTGLOCK_RMAP)) 796 + sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); 797 + 798 + if (xfs_has_rtreflink(sc->mp) && (rtglock_flags & XFS_RTGLOCK_REFCOUNT)) 799 + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); 800 + 801 + return 0; 802 + } 803 + 804 + /* 805 + * Free all the btree cursors and other incore data relating to the realtime 806 + * group. This has to be done /before/ committing (or cancelling) the scrub 807 + * transaction. 808 + */ 809 + void 810 + xchk_rtgroup_btcur_free( 811 + struct xchk_rt *sr) 812 + { 813 + if (sr->rmap_cur) 814 + xfs_btree_del_cursor(sr->rmap_cur, XFS_BTREE_ERROR); 815 + if (sr->refc_cur) 816 + xfs_btree_del_cursor(sr->refc_cur, XFS_BTREE_ERROR); 817 + 818 + sr->refc_cur = NULL; 819 + sr->rmap_cur = NULL; 732 820 } 733 821 734 822 /* 735 823 * Unlock the realtime group. 
This must be done /after/ committing (or 736 824 * cancelling) the scrub transaction. 737 825 */ 738 - static void 826 + void 739 827 xchk_rtgroup_unlock( 740 828 struct xchk_rt *sr) 741 829 { ··· 904 810 905 811 resblks = xrep_calc_ag_resblks(sc); 906 812 return xchk_trans_alloc(sc, resblks); 813 + } 814 + 815 + /* Set us up with a transaction and an empty context to repair rt metadata. */ 816 + int 817 + xchk_setup_rt( 818 + struct xfs_scrub *sc) 819 + { 820 + return xchk_trans_alloc(sc, xrep_calc_rtgroup_resblks(sc)); 907 821 } 908 822 909 823 /* Set us up with AG headers and btree cursors. */ ··· 1481 1379 trace_xchk_fsgates_enable(sc, scrub_fsgates); 1482 1380 1483 1381 if (scrub_fsgates & XCHK_FSGATES_DRAIN) 1484 - xfs_drain_wait_enable(); 1382 + xfs_defer_drain_wait_enable(); 1485 1383 1486 1384 if (scrub_fsgates & XCHK_FSGATES_QUOTA) 1487 1385 xfs_dqtrx_hook_enable(); ··· 1674 1572 if (xfs_is_metadir_inode(ip)) 1675 1573 return mp->m_metadirip->i_ino; 1676 1574 return mp->m_rootip->i_ino; 1575 + } 1576 + 1577 + static int 1578 + xchk_meta_btree_count_blocks( 1579 + struct xfs_scrub *sc, 1580 + xfs_extnum_t *nextents, 1581 + xfs_filblks_t *count) 1582 + { 1583 + struct xfs_btree_cur *cur; 1584 + int error; 1585 + 1586 + if (!sc->sr.rtg) { 1587 + ASSERT(0); 1588 + return -EFSCORRUPTED; 1589 + } 1590 + 1591 + switch (sc->ip->i_metatype) { 1592 + case XFS_METAFILE_RTRMAP: 1593 + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); 1594 + break; 1595 + case XFS_METAFILE_RTREFCOUNT: 1596 + cur = xfs_rtrefcountbt_init_cursor(sc->tp, sc->sr.rtg); 1597 + break; 1598 + default: 1599 + ASSERT(0); 1600 + return -EFSCORRUPTED; 1601 + } 1602 + 1603 + error = xfs_btree_count_blocks(cur, count); 1604 + xfs_btree_del_cursor(cur, error); 1605 + if (!error) { 1606 + *nextents = 0; 1607 + (*count)--; /* don't count the btree iroot */ 1608 + } 1609 + return error; 1610 + } 1611 + 1612 + /* Count the blocks used by a file, even if it's a metadata inode. 
*/ 1613 + int 1614 + xchk_inode_count_blocks( 1615 + struct xfs_scrub *sc, 1616 + int whichfork, 1617 + xfs_extnum_t *nextents, 1618 + xfs_filblks_t *count) 1619 + { 1620 + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); 1621 + 1622 + if (!ifp) { 1623 + *nextents = 0; 1624 + *count = 0; 1625 + return 0; 1626 + } 1627 + 1628 + if (ifp->if_format == XFS_DINODE_FMT_META_BTREE) { 1629 + ASSERT(whichfork == XFS_DATA_FORK); 1630 + return xchk_meta_btree_count_blocks(sc, nextents, count); 1631 + } 1632 + 1633 + return xfs_bmap_count_blocks(sc->tp, sc->ip, whichfork, nextents, 1634 + count); 1677 1635 }

+23 -3

fs/xfs/scrub/common.h

··· 63 63 /* Setup functions */ 64 64 int xchk_setup_agheader(struct xfs_scrub *sc); 65 65 int xchk_setup_fs(struct xfs_scrub *sc); 66 + int xchk_setup_rt(struct xfs_scrub *sc); 66 67 int xchk_setup_ag_allocbt(struct xfs_scrub *sc); 67 68 int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); 68 69 int xchk_setup_ag_rmapbt(struct xfs_scrub *sc); ··· 81 80 int xchk_setup_rtbitmap(struct xfs_scrub *sc); 82 81 int xchk_setup_rtsummary(struct xfs_scrub *sc); 83 82 int xchk_setup_rgsuperblock(struct xfs_scrub *sc); 83 + int xchk_setup_rtrmapbt(struct xfs_scrub *sc); 84 + int xchk_setup_rtrefcountbt(struct xfs_scrub *sc); 84 85 #else 85 86 # define xchk_setup_rtbitmap xchk_setup_nothing 86 87 # define xchk_setup_rtsummary xchk_setup_nothing 87 88 # define xchk_setup_rgsuperblock xchk_setup_nothing 89 + # define xchk_setup_rtrmapbt xchk_setup_nothing 90 + # define xchk_setup_rtrefcountbt xchk_setup_nothing 88 91 #endif 89 92 #ifdef CONFIG_XFS_QUOTA 90 93 int xchk_ino_dqattach(struct xfs_scrub *sc); ··· 130 125 #ifdef CONFIG_XFS_RT 131 126 132 127 /* All the locks we need to check an rtgroup. */ 133 - #define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP) 128 + #define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ 129 + XFS_RTGLOCK_RMAP | \ 130 + XFS_RTGLOCK_REFCOUNT) 134 131 135 132 int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno, 136 133 struct xchk_rt *sr); ··· 148 141 return error == -ENOENT ? 
-EFSCORRUPTED : error; 149 142 } 150 143 151 - void xchk_rtgroup_lock(struct xchk_rt *sr, unsigned int rtglock_flags); 144 + int xchk_rtgroup_lock(struct xfs_scrub *sc, struct xchk_rt *sr, 145 + unsigned int rtglock_flags); 146 + void xchk_rtgroup_unlock(struct xchk_rt *sr); 147 + void xchk_rtgroup_btcur_free(struct xchk_rt *sr); 152 148 void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr); 153 149 #else 154 150 # define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED) 155 151 # define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED) 156 - # define xchk_rtgroup_lock(sc, lockflags) do { } while (0) 152 + # define xchk_rtgroup_lock(sc, sr, lockflags) (-EFSCORRUPTED) 153 + # define xchk_rtgroup_unlock(sr) do { } while (0) 154 + # define xchk_rtgroup_btcur_free(sr) do { } while (0) 157 155 # define xchk_rtgroup_free(sc, sr) do { } while (0) 158 156 #endif /* CONFIG_XFS_RT */ 159 157 ··· 269 257 (sc)->mp->m_super->s_id, \ 270 258 (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ 271 259 ##__VA_ARGS__) 260 + #define xchk_xfile_rtgroup_descr(sc, fmt, ...) \ 261 + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): rtgroup 0x%x " fmt, \ 262 + (sc)->mp->m_super->s_id, \ 263 + (sc)->sa.pag ? \ 264 + rtg_rgno((sc)->sr.rtg) : (sc)->sm->sm_agno, \ 265 + ##__VA_ARGS__) 272 266 273 267 /* 274 268 * Setting up a hook to wait for intents to drain is costly -- we have to take ··· 292 274 293 275 int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, 294 276 bool *inuse); 277 + int xchk_inode_count_blocks(struct xfs_scrub *sc, int whichfork, 278 + xfs_extnum_t *nextents, xfs_filblks_t *count); 295 279 296 280 bool xchk_inode_is_dirtree_root(const struct xfs_inode *ip); 297 281 bool xchk_inode_is_sb_rooted(const struct xfs_inode *ip);

+162 -18

fs/xfs/scrub/cow_repair.c

··· 26 26 #include "xfs_errortag.h" 27 27 #include "xfs_icache.h" 28 28 #include "xfs_refcount_btree.h" 29 + #include "xfs_rtalloc.h" 30 + #include "xfs_rtbitmap.h" 31 + #include "xfs_rtgroup.h" 29 32 #include "scrub/xfs_scrub.h" 30 33 #include "scrub/scrub.h" 31 34 #include "scrub/common.h" ··· 37 34 #include "scrub/bitmap.h" 38 35 #include "scrub/off_bitmap.h" 39 36 #include "scrub/fsb_bitmap.h" 37 + #include "scrub/rtb_bitmap.h" 40 38 #include "scrub/reap.h" 41 39 42 40 /* ··· 65 61 struct xoff_bitmap bad_fileoffs; 66 62 67 63 /* Bitmap of fsblocks that were removed from the CoW fork. */ 68 - struct xfsb_bitmap old_cowfork_fsblocks; 64 + union { 65 + struct xfsb_bitmap old_cowfork_fsblocks; 66 + struct xrtb_bitmap old_cowfork_rtblocks; 67 + }; 69 68 70 69 /* CoW fork mappings used to scan for bad CoW staging extents. */ 71 70 struct xfs_bmbt_irec irec; ··· 152 145 xrep_cow_trim_refcount(xc, &rrec, rec); 153 146 154 147 return xrep_cow_mark_file_range(xc, 155 - xfs_agbno_to_fsb(to_perag(cur->bc_group), 156 - rrec.rc_startblock), 148 + xfs_gbno_to_fsb(cur->bc_group, rrec.rc_startblock), 157 149 rrec.rc_blockcount); 158 150 } 159 151 ··· 183 177 if (xc->next_bno >= rrec.rc_startblock) 184 178 goto next; 185 179 186 - 187 180 error = xrep_cow_mark_file_range(xc, 188 - xfs_agbno_to_fsb(to_perag(cur->bc_group), xc->next_bno), 181 + xfs_gbno_to_fsb(cur->bc_group, xc->next_bno), 189 182 rrec.rc_startblock - xc->next_bno); 190 183 if (error) 191 184 return error; ··· 227 222 } 228 223 229 224 return xrep_cow_mark_file_range(xc, 230 - xfs_agbno_to_fsb(to_perag(cur->bc_group), rec_bno), 231 - rec_len); 225 + xfs_gbno_to_fsb(cur->bc_group, rec_bno), rec_len); 232 226 } 233 227 234 228 /* ··· 315 311 } 316 312 317 313 /* 314 + * Find any part of the CoW fork mapping that isn't a single-owner CoW staging 315 + * extent and mark the corresponding part of the file range in the bitmap. 
316 + */ 317 + STATIC int 318 + xrep_cow_find_bad_rt( 319 + struct xrep_cow *xc) 320 + { 321 + struct xfs_refcount_irec rc_low = { 0 }; 322 + struct xfs_refcount_irec rc_high = { 0 }; 323 + struct xfs_rmap_irec rm_low = { 0 }; 324 + struct xfs_rmap_irec rm_high = { 0 }; 325 + struct xfs_scrub *sc = xc->sc; 326 + struct xfs_rtgroup *rtg; 327 + int error = 0; 328 + 329 + xc->irec_startbno = xfs_rtb_to_rgbno(sc->mp, xc->irec.br_startblock); 330 + 331 + rtg = xfs_rtgroup_get(sc->mp, 332 + xfs_rtb_to_rgno(sc->mp, xc->irec.br_startblock)); 333 + if (!rtg) 334 + return -EFSCORRUPTED; 335 + 336 + error = xrep_rtgroup_init(sc, rtg, &sc->sr, 337 + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); 338 + if (error) 339 + goto out_rtg; 340 + 341 + /* Mark any CoW fork extents that are shared. */ 342 + rc_low.rc_startblock = xc->irec_startbno; 343 + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 344 + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED; 345 + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, 346 + xrep_cow_mark_shared_staging, xc); 347 + if (error) 348 + goto out_sr; 349 + 350 + /* Make sure there are CoW staging extents for the whole mapping. */ 351 + rc_low.rc_startblock = xc->irec_startbno; 352 + rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 353 + rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW; 354 + xc->next_bno = xc->irec_startbno; 355 + error = xfs_refcount_query_range(sc->sr.refc_cur, &rc_low, &rc_high, 356 + xrep_cow_mark_missing_staging, xc); 357 + if (error) 358 + goto out_sr; 359 + 360 + if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) { 361 + error = xrep_cow_mark_file_range(xc, 362 + xfs_rgbno_to_rtb(rtg, xc->next_bno), 363 + xc->irec_startbno + xc->irec.br_blockcount - 364 + xc->next_bno); 365 + if (error) 366 + goto out_sr; 367 + } 368 + 369 + /* Mark any area has an rmap that isn't a COW staging extent. 
*/ 370 + rm_low.rm_startblock = xc->irec_startbno; 371 + memset(&rm_high, 0xFF, sizeof(rm_high)); 372 + rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1; 373 + error = xfs_rmap_query_range(sc->sr.rmap_cur, &rm_low, &rm_high, 374 + xrep_cow_mark_missing_staging_rmap, xc); 375 + if (error) 376 + goto out_sr; 377 + 378 + /* 379 + * If userspace is forcing us to rebuild the CoW fork or someone 380 + * turned on the debugging knob, replace everything in the 381 + * CoW fork and then scan for staging extents in the refcountbt. 382 + */ 383 + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || 384 + XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { 385 + error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, 386 + xc->irec.br_blockcount); 387 + if (error) 388 + goto out_rtg; 389 + } 390 + 391 + out_sr: 392 + xchk_rtgroup_btcur_free(&sc->sr); 393 + xchk_rtgroup_free(sc, &sc->sr); 394 + out_rtg: 395 + xfs_rtgroup_put(rtg); 396 + return error; 397 + } 398 + 399 + /* 318 400 * Allocate a replacement CoW staging extent of up to the given number of 319 401 * blocks, and fill out the mapping. 320 402 */ ··· 433 343 if (args.fsbno == NULLFSBLOCK) 434 344 return -ENOSPC; 435 345 436 - xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len); 346 + xfs_refcount_alloc_cow_extent(sc->tp, false, args.fsbno, args.len); 437 347 438 348 repl->fsbno = args.fsbno; 439 349 repl->len = args.len; 350 + return 0; 351 + } 352 + 353 + /* 354 + * Allocate a replacement rt CoW staging extent of up to the given number of 355 + * blocks, and fill out the mapping. 
356 + */ 357 + STATIC int 358 + xrep_cow_alloc_rt( 359 + struct xfs_scrub *sc, 360 + xfs_extlen_t maxlen, 361 + struct xrep_cow_extent *repl) 362 + { 363 + xfs_rtxlen_t maxrtx = xfs_rtb_to_rtx(sc->mp, maxlen); 364 + int error; 365 + 366 + error = xfs_trans_reserve_more(sc->tp, 0, maxrtx); 367 + if (error) 368 + return error; 369 + 370 + error = xfs_rtallocate_rtgs(sc->tp, NULLRTBLOCK, 1, maxrtx, 1, false, 371 + false, &repl->fsbno, &repl->len); 372 + if (error) 373 + return error; 374 + 375 + xfs_refcount_alloc_cow_extent(sc->tp, true, repl->fsbno, repl->len); 440 376 return 0; 441 377 } 442 378 ··· 583 467 */ 584 468 alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN, 585 469 nextoff - startoff); 586 - error = xrep_cow_alloc(sc, alloc_len, &repl); 470 + if (XFS_IS_REALTIME_INODE(sc->ip)) 471 + error = xrep_cow_alloc_rt(sc, alloc_len, &repl); 472 + else 473 + error = xrep_cow_alloc(sc, alloc_len, &repl); 587 474 if (error) 588 475 return error; 589 476 ··· 602 483 return error; 603 484 604 485 /* Note the old CoW staging extents; we'll reap them all later. */ 605 - error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock, 606 - repl.len); 486 + if (XFS_IS_REALTIME_INODE(sc->ip)) 487 + error = xrtb_bitmap_set(&xc->old_cowfork_rtblocks, 488 + got.br_startblock, repl.len); 489 + else 490 + error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, 491 + got.br_startblock, repl.len); 607 492 if (error) 608 493 return error; 609 494 ··· 663 540 if (!ifp) 664 541 return 0; 665 542 666 - /* realtime files aren't supported yet */ 667 - if (XFS_IS_REALTIME_INODE(sc->ip)) 543 + /* 544 + * Realtime files with large extent sizes are not supported because 545 + * we could encounter a CoW mapping that has been partially written 546 + * out *and* requires replacement, and there's no solution to that. 547 + */ 548 + if (xfs_inode_has_bigrtalloc(sc->ip)) 549 + return -EOPNOTSUPP; 550 + 551 + /* Metadata inodes aren't supposed to have data on the rt volume. 
*/ 552 + if (xfs_is_metadir_inode(sc->ip) && XFS_IS_REALTIME_INODE(sc->ip)) 668 553 return -EOPNOTSUPP; 669 554 670 555 /* ··· 693 562 694 563 xc->sc = sc; 695 564 xoff_bitmap_init(&xc->bad_fileoffs); 696 - xfsb_bitmap_init(&xc->old_cowfork_fsblocks); 565 + if (XFS_IS_REALTIME_INODE(sc->ip)) 566 + xrtb_bitmap_init(&xc->old_cowfork_rtblocks); 567 + else 568 + xfsb_bitmap_init(&xc->old_cowfork_fsblocks); 697 569 698 570 for_each_xfs_iext(ifp, &icur, &xc->irec) { 699 571 if (xchk_should_terminate(sc, &error)) ··· 719 585 if (xfs_bmap_is_written_extent(&xc->irec)) 720 586 continue; 721 587 722 - error = xrep_cow_find_bad(xc); 588 + if (XFS_IS_REALTIME_INODE(sc->ip)) 589 + error = xrep_cow_find_bad_rt(xc); 590 + else 591 + error = xrep_cow_find_bad(xc); 723 592 if (error) 724 593 goto out_bitmap; 725 594 } ··· 737 600 * by the refcount btree, not the inode, so it is correct to treat them 738 601 * like inode metadata. 739 602 */ 740 - error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, 741 - &XFS_RMAP_OINFO_COW); 603 + if (XFS_IS_REALTIME_INODE(sc->ip)) 604 + error = xrep_reap_rtblocks(sc, &xc->old_cowfork_rtblocks, 605 + &XFS_RMAP_OINFO_COW); 606 + else 607 + error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks, 608 + &XFS_RMAP_OINFO_COW); 742 609 if (error) 743 610 goto out_bitmap; 744 611 745 612 out_bitmap: 746 - xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); 613 + if (XFS_IS_REALTIME_INODE(sc->ip)) 614 + xrtb_bitmap_destroy(&xc->old_cowfork_rtblocks); 615 + else 616 + xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); 747 617 xoff_bitmap_destroy(&xc->bad_fileoffs); 748 618 kfree(xc); 749 619 return error;

+2

fs/xfs/scrub/health.c

··· 114 114 [XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE }, 115 115 [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH }, 116 116 [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER }, 117 + [XFS_SCRUB_TYPE_RTRMAPBT] = { XHG_RTGROUP, XFS_SICK_RG_RMAPBT }, 118 + [XFS_SCRUB_TYPE_RTREFCBT] = { XHG_RTGROUP, XFS_SICK_RG_REFCNTBT }, 117 119 }; 118 120 119 121 /* Return the health status mask for this scrub type. */

+26 -15

fs/xfs/scrub/inode.c

··· 260 260 xchk_ino_set_warning(sc, ino); 261 261 } 262 262 263 - /* 264 - * Validate di_cowextsize hint. 265 - * 266 - * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). 267 - * These functions must be kept in sync with each other. 268 - */ 263 + /* Validate di_cowextsize hint. */ 269 264 STATIC void 270 265 xchk_inode_cowextsize( 271 266 struct xfs_scrub *sc, ··· 271 276 uint64_t flags2) 272 277 { 273 278 xfs_failaddr_t fa; 279 + uint32_t value = be32_to_cpu(dip->di_cowextsize); 274 280 275 - fa = xfs_inode_validate_cowextsize(sc->mp, 276 - be32_to_cpu(dip->di_cowextsize), mode, flags, 277 - flags2); 281 + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); 278 282 if (fa) 279 283 xchk_ino_set_corrupt(sc, ino); 284 + 285 + /* 286 + * XFS allows a sysadmin to change the rt extent size when adding a rt 287 + * section to a filesystem after formatting. If there are any 288 + * directories with cowextsize and rtinherit set, the hint could become 289 + * misaligned with the new rextsize. The verifier doesn't check this, 290 + * because we allow rtinherit directories even without an rt device. 291 + * Flag this as an administrative warning since we will clean this up 292 + * eventually. 293 + */ 294 + if ((flags & XFS_DIFLAG_RTINHERIT) && 295 + (flags2 & XFS_DIFLAG2_COWEXTSIZE) && 296 + value % sc->mp->m_sb.sb_rextsize > 0) 297 + xchk_ino_set_warning(sc, ino); 280 298 } 281 299 282 300 /* Make sure the di_flags make sense for the inode. 
*/ ··· 368 360 if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) 369 361 goto bad; 370 362 371 - /* realtime and reflink make no sense, currently */ 372 - if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) 363 + /* realtime and reflink don't always go together */ 364 + if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK) && 365 + !xfs_has_rtreflink(mp)) 373 366 goto bad; 374 367 375 368 /* no bigtime iflag without the bigtime feature */ ··· 509 500 break; 510 501 case XFS_DINODE_FMT_BTREE: 511 502 if (!S_ISREG(mode) && !S_ISDIR(mode)) 503 + xchk_ino_set_corrupt(sc, ino); 504 + break; 505 + case XFS_DINODE_FMT_META_BTREE: 506 + if (!S_ISREG(mode)) 512 507 xchk_ino_set_corrupt(sc, ino); 513 508 break; 514 509 case XFS_DINODE_FMT_UUID: ··· 699 686 return; 700 687 701 688 /* Walk all the extents to check nextents/naextents/nblocks. */ 702 - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 703 - &nextents, &count); 689 + error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count); 704 690 if (!xchk_should_check_xref(sc, &error, NULL)) 705 691 return; 706 692 if (nextents < xfs_dfork_data_extents(dip)) 707 693 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); 708 694 709 - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 710 - &nextents, &acount); 695 + error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, &acount); 711 696 if (!xchk_should_check_xref(sc, &error, NULL)) 712 697 return; 713 698 if (nextents != xfs_dfork_attr_extents(dip))

+182 -11

fs/xfs/scrub/inode_repair.c

··· 38 38 #include "xfs_log_priv.h" 39 39 #include "xfs_health.h" 40 40 #include "xfs_symlink_remote.h" 41 + #include "xfs_rtgroup.h" 42 + #include "xfs_rtrmap_btree.h" 43 + #include "xfs_rtrefcount_btree.h" 41 44 #include "scrub/xfs_scrub.h" 42 45 #include "scrub/scrub.h" 43 46 #include "scrub/common.h" ··· 565 562 flags2 |= XFS_DIFLAG2_REFLINK; 566 563 else 567 564 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); 568 - if (flags & XFS_DIFLAG_REALTIME) 569 - flags2 &= ~XFS_DIFLAG2_REFLINK; 570 565 if (!xfs_has_bigtime(mp)) 571 566 flags2 &= ~XFS_DIFLAG2_BIGTIME; 572 567 if (!xfs_has_large_extent_counts(mp)) ··· 774 773 return error; 775 774 } 776 775 776 + /* Count extents and blocks for an inode given an rt rmap. */ 777 + STATIC int 778 + xrep_dinode_walk_rtrmap( 779 + struct xfs_btree_cur *cur, 780 + const struct xfs_rmap_irec *rec, 781 + void *priv) 782 + { 783 + struct xrep_inode *ri = priv; 784 + int error = 0; 785 + 786 + if (xchk_should_terminate(ri->sc, &error)) 787 + return error; 788 + 789 + /* We only care about this inode. */ 790 + if (rec->rm_owner != ri->sc->sm->sm_ino) 791 + return 0; 792 + 793 + if (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) 794 + return -EFSCORRUPTED; 795 + 796 + ri->rt_blocks += rec->rm_blockcount; 797 + ri->rt_extents++; 798 + return 0; 799 + } 800 + 801 + /* Count extents and blocks for an inode from all realtime rmap data. */ 802 + STATIC int 803 + xrep_dinode_count_rtgroup_rmaps( 804 + struct xrep_inode *ri, 805 + struct xfs_rtgroup *rtg) 806 + { 807 + struct xfs_scrub *sc = ri->sc; 808 + int error; 809 + 810 + error = xrep_rtgroup_init(sc, rtg, &sc->sr, XFS_RTGLOCK_RMAP); 811 + if (error) 812 + return error; 813 + 814 + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_dinode_walk_rtrmap, 815 + ri); 816 + xchk_rtgroup_btcur_free(&sc->sr); 817 + xchk_rtgroup_free(sc, &sc->sr); 818 + return error; 819 + } 820 + 777 821 /* Count extents and blocks for a given inode from all rmap data. 
*/ 778 822 STATIC int 779 823 xrep_dinode_count_rmaps( 780 824 struct xrep_inode *ri) 781 825 { 782 826 struct xfs_perag *pag = NULL; 827 + struct xfs_rtgroup *rtg = NULL; 783 828 int error; 784 829 785 - if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) 830 + if (!xfs_has_rmapbt(ri->sc->mp)) 786 831 return -EOPNOTSUPP; 832 + 833 + while ((rtg = xfs_rtgroup_next(ri->sc->mp, rtg))) { 834 + error = xrep_dinode_count_rtgroup_rmaps(ri, rtg); 835 + if (error) { 836 + xfs_rtgroup_rele(rtg); 837 + return error; 838 + } 839 + } 787 840 788 841 while ((pag = xfs_perag_next(ri->sc->mp, pag))) { 789 842 error = xrep_dinode_count_ag_rmaps(ri, pag); ··· 943 888 return false; 944 889 } 945 890 891 + /* Return true if this rmap-format ifork looks like garbage. */ 892 + STATIC bool 893 + xrep_dinode_bad_rtrmapbt_fork( 894 + struct xfs_scrub *sc, 895 + struct xfs_dinode *dip, 896 + unsigned int dfork_size) 897 + { 898 + struct xfs_rtrmap_root *dfp; 899 + unsigned int nrecs; 900 + unsigned int level; 901 + 902 + if (dfork_size < sizeof(struct xfs_rtrmap_root)) 903 + return true; 904 + 905 + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 906 + nrecs = be16_to_cpu(dfp->bb_numrecs); 907 + level = be16_to_cpu(dfp->bb_level); 908 + 909 + if (level > sc->mp->m_rtrmap_maxlevels) 910 + return true; 911 + if (xfs_rtrmap_droot_space_calc(level, nrecs) > dfork_size) 912 + return true; 913 + if (level > 0 && nrecs == 0) 914 + return true; 915 + 916 + return false; 917 + } 918 + 919 + /* Return true if this refcount-format ifork looks like garbage. 
*/ 920 + STATIC bool 921 + xrep_dinode_bad_rtrefcountbt_fork( 922 + struct xfs_scrub *sc, 923 + struct xfs_dinode *dip, 924 + unsigned int dfork_size) 925 + { 926 + struct xfs_rtrefcount_root *dfp; 927 + unsigned int nrecs; 928 + unsigned int level; 929 + 930 + if (dfork_size < sizeof(struct xfs_rtrefcount_root)) 931 + return true; 932 + 933 + dfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 934 + nrecs = be16_to_cpu(dfp->bb_numrecs); 935 + level = be16_to_cpu(dfp->bb_level); 936 + 937 + if (level > sc->mp->m_rtrefc_maxlevels) 938 + return true; 939 + if (xfs_rtrefcount_droot_space_calc(level, nrecs) > dfork_size) 940 + return true; 941 + if (level > 0 && nrecs == 0) 942 + return true; 943 + 944 + return false; 945 + } 946 + 947 + /* Check a metadata-btree fork. */ 948 + STATIC bool 949 + xrep_dinode_bad_metabt_fork( 950 + struct xfs_scrub *sc, 951 + struct xfs_dinode *dip, 952 + unsigned int dfork_size, 953 + int whichfork) 954 + { 955 + if (whichfork != XFS_DATA_FORK) 956 + return true; 957 + 958 + switch (be16_to_cpu(dip->di_metatype)) { 959 + case XFS_METAFILE_RTRMAP: 960 + return xrep_dinode_bad_rtrmapbt_fork(sc, dip, dfork_size); 961 + case XFS_METAFILE_RTREFCOUNT: 962 + return xrep_dinode_bad_rtrefcountbt_fork(sc, dip, dfork_size); 963 + default: 964 + return true; 965 + } 966 + 967 + return false; 968 + } 969 + 946 970 /* 947 971 * Check the data fork for things that will fail the ifork verifiers or the 948 972 * ifork formatters. 
··· 1099 965 break; 1100 966 case XFS_DINODE_FMT_BTREE: 1101 967 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, 968 + XFS_DATA_FORK)) 969 + return true; 970 + break; 971 + case XFS_DINODE_FMT_META_BTREE: 972 + if (xrep_dinode_bad_metabt_fork(sc, dip, dfork_size, 1102 973 XFS_DATA_FORK)) 1103 974 return true; 1104 975 break; ··· 1227 1088 XFS_ATTR_FORK)) 1228 1089 return true; 1229 1090 break; 1091 + case XFS_DINODE_FMT_META_BTREE: 1092 + if (xrep_dinode_bad_metabt_fork(sc, dip, afork_size, 1093 + XFS_ATTR_FORK)) 1094 + return true; 1095 + break; 1230 1096 default: 1231 1097 return true; 1232 1098 } ··· 1279 1135 uint16_t mode) 1280 1136 { 1281 1137 struct xfs_bmdr_block *bmdr; 1138 + struct xfs_rtrmap_root *rmdr; 1139 + struct xfs_rtrefcount_root *rcdr; 1282 1140 struct xfs_scrub *sc = ri->sc; 1283 1141 xfs_extnum_t attr_extents, data_extents; 1284 1142 size_t bmdr_minsz = xfs_bmdr_space_calc(1); ··· 1386 1240 /* Must have space for btree header and key/pointers. */ 1387 1241 bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 1388 1242 dfork_min = xfs_bmap_broot_space(sc->mp, bmdr); 1243 + break; 1244 + case XFS_DINODE_FMT_META_BTREE: 1245 + switch (be16_to_cpu(dip->di_metatype)) { 1246 + case XFS_METAFILE_RTRMAP: 1247 + rmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 1248 + dfork_min = xfs_rtrmap_broot_space(sc->mp, rmdr); 1249 + break; 1250 + case XFS_METAFILE_RTREFCOUNT: 1251 + rcdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); 1252 + dfork_min = xfs_rtrefcount_broot_space(sc->mp, rcdr); 1253 + break; 1254 + default: 1255 + dfork_min = 0; 1256 + break; 1257 + } 1389 1258 break; 1390 1259 default: 1391 1260 dfork_min = 0; ··· 1661 1500 trace_xrep_inode_blockcounts(sc); 1662 1501 1663 1502 /* Set data fork counters from the data fork mappings. 
*/ 1664 - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 1665 - &nextents, &count); 1503 + error = xchk_inode_count_blocks(sc, XFS_DATA_FORK, &nextents, &count); 1666 1504 if (error) 1667 1505 return error; 1668 1506 if (xfs_is_reflink_inode(sc->ip)) { ··· 1685 1525 /* Set attr fork counters from the attr fork mappings. */ 1686 1526 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); 1687 1527 if (ifp) { 1688 - error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 1689 - &nextents, &acount); 1528 + error = xchk_inode_count_blocks(sc, XFS_ATTR_FORK, &nextents, 1529 + &acount); 1690 1530 if (error) 1691 1531 return error; 1692 1532 if (count >= sc->mp->m_sb.sb_dblocks) ··· 1824 1664 /* DAX only applies to files and dirs. */ 1825 1665 if (!(S_ISREG(mode) || S_ISDIR(mode))) 1826 1666 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; 1827 - 1828 - /* No reflink files on the realtime device. */ 1829 - if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) 1830 - sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1831 1667 } 1832 1668 1833 1669 /* ··· 1939 1783 sizeof(struct xfs_attr_sf_hdr), true); 1940 1784 } 1941 1785 1786 + /* Fix COW extent size hint problems. */ 1787 + STATIC void 1788 + xrep_inode_cowextsize( 1789 + struct xfs_scrub *sc) 1790 + { 1791 + /* Fix misaligned CoW extent size hints on a directory. */ 1792 + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && 1793 + (sc->ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && 1794 + sc->ip->i_extsize % sc->mp->m_sb.sb_rextsize > 0) { 1795 + sc->ip->i_cowextsize = 0; 1796 + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; 1797 + } 1798 + } 1799 + 1942 1800 /* Fix any irregularities in an inode that the verifiers don't catch. */ 1943 1801 STATIC int 1944 1802 xrep_inode_problems( ··· 1976 1806 if (S_ISDIR(VFS_I(sc->ip)->i_mode)) 1977 1807 xrep_inode_dir_size(sc); 1978 1808 xrep_inode_extsize(sc); 1809 + xrep_inode_cowextsize(sc); 1979 1810 1980 1811 trace_xrep_inode_fixed(sc); 1981 1812 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);

+6

fs/xfs/scrub/metapath.c

··· 21 21 #include "xfs_trans_space.h" 22 22 #include "xfs_attr.h" 23 23 #include "xfs_rtgroup.h" 24 + #include "xfs_rtrmap_btree.h" 25 + #include "xfs_rtrefcount_btree.h" 24 26 #include "scrub/scrub.h" 25 27 #include "scrub/common.h" 26 28 #include "scrub/trace.h" ··· 248 246 return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_GROUP); 249 247 case XFS_SCRUB_METAPATH_PRJQUOTA: 250 248 return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ); 249 + case XFS_SCRUB_METAPATH_RTRMAPBT: 250 + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_RMAP); 251 + case XFS_SCRUB_METAPATH_RTREFCOUNTBT: 252 + return xchk_setup_metapath_rtginode(sc, XFS_RTGI_REFCOUNT); 251 253 default: 252 254 return -ENOENT; 253 255 }

+42

fs/xfs/scrub/newbt.c

··· 19 19 #include "xfs_rmap.h" 20 20 #include "xfs_ag.h" 21 21 #include "xfs_defer.h" 22 + #include "xfs_metafile.h" 23 + #include "xfs_quota.h" 22 24 #include "scrub/scrub.h" 23 25 #include "scrub/common.h" 24 26 #include "scrub/trace.h" ··· 119 117 XFS_AG_RESV_NONE); 120 118 xnr->ifake.if_fork = ifp; 121 119 xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); 120 + return 0; 121 + } 122 + 123 + /* 124 + * Initialize accounting resources for staging a new metadata inode btree. 125 + * If the metadata file has a space reservation, the caller must adjust that 126 + * reservation when committing the new ondisk btree. 127 + */ 128 + int 129 + xrep_newbt_init_metadir_inode( 130 + struct xrep_newbt *xnr, 131 + struct xfs_scrub *sc) 132 + { 133 + struct xfs_owner_info oinfo; 134 + struct xfs_ifork *ifp; 135 + 136 + ASSERT(xfs_is_metadir_inode(sc->ip)); 137 + 138 + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); 139 + 140 + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); 141 + if (!ifp) 142 + return -ENOMEM; 143 + 144 + /* 145 + * Allocate new metadir btree blocks with XFS_AG_RESV_NONE because the 146 + * inode metadata space reservations can only account allocated space 147 + * to the i_nblocks. We do not want to change the inode core fields 148 + * until we're ready to commit the new tree, so we allocate the blocks 149 + * as if they were regular file blocks. This exposes us to a higher 150 + * risk of the repair being cancelled due to ENOSPC. 
151 + */ 152 + xrep_newbt_init_ag(xnr, sc, &oinfo, 153 + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), 154 + XFS_AG_RESV_NONE); 155 + xnr->ifake.if_fork = ifp; 156 + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, XFS_DATA_FORK); 122 157 return 0; 123 158 } 124 159 ··· 263 224 int error = 0; 264 225 265 226 ASSERT(sc->sa.pag != NULL); 227 + ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); 266 228 267 229 while (nr_blocks > 0) { 268 230 struct xfs_alloc_arg args = { ··· 336 296 struct xfs_scrub *sc = xnr->sc; 337 297 struct xfs_mount *mp = sc->mp; 338 298 int error = 0; 299 + 300 + ASSERT(xnr->resv != XFS_AG_RESV_METAFILE); 339 301 340 302 while (nr_blocks > 0) { 341 303 struct xfs_alloc_arg args = {

+1

fs/xfs/scrub/newbt.h

··· 63 63 enum xfs_ag_resv_type resv); 64 64 int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, 65 65 int whichfork, const struct xfs_owner_info *oinfo); 66 + int xrep_newbt_init_metadir_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc); 66 67 int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); 67 68 int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag, 68 69 xfs_agblock_t agbno, xfs_extlen_t len);

+7 -1

fs/xfs/scrub/quota.c

··· 212 212 if (mp->m_sb.sb_dblocks < dq->q_blk.count) 213 213 xchk_fblock_set_warning(sc, XFS_DATA_FORK, 214 214 offset); 215 + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) 216 + xchk_fblock_set_warning(sc, XFS_DATA_FORK, 217 + offset); 215 218 } else { 216 219 if (mp->m_sb.sb_dblocks < dq->q_blk.count) 217 220 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 218 221 offset); 222 + if (mp->m_sb.sb_rblocks < dq->q_rtb.count) 223 + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 224 + offset); 219 225 } 220 - if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks) 226 + if (dq->q_ino.count > fs_icount) 221 227 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); 222 228 223 229 /*

+1 -1

fs/xfs/scrub/quota_repair.c

··· 233 233 rqi->need_quotacheck = true; 234 234 dirty = true; 235 235 } 236 - if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { 236 + if (!xfs_has_reflink(mp) && dq->q_rtb.count > mp->m_sb.sb_rblocks) { 237 237 dq->q_rtb.reserved -= dq->q_rtb.count; 238 238 dq->q_rtb.reserved += mp->m_sb.sb_rblocks; 239 239 dq->q_rtb.count = mp->m_sb.sb_rblocks;

+278 -10

fs/xfs/scrub/reap.c

··· 33 33 #include "xfs_attr.h" 34 34 #include "xfs_attr_remote.h" 35 35 #include "xfs_defer.h" 36 + #include "xfs_metafile.h" 37 + #include "xfs_rtgroup.h" 38 + #include "xfs_rtrmap_btree.h" 36 39 #include "scrub/scrub.h" 37 40 #include "scrub/common.h" 38 41 #include "scrub/trace.h" ··· 43 40 #include "scrub/bitmap.h" 44 41 #include "scrub/agb_bitmap.h" 45 42 #include "scrub/fsb_bitmap.h" 43 + #include "scrub/rtb_bitmap.h" 46 44 #include "scrub/reap.h" 47 45 48 46 /* ··· 314 310 } 315 311 316 312 out: 317 - trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); 313 + trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp); 318 314 } 319 315 320 316 /* ··· 373 369 374 370 out_found: 375 371 *aglenp = len; 376 - trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); 372 + trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len, 373 + *crosslinked); 377 374 out_cur: 378 375 xfs_btree_del_cursor(cur, error); 379 376 return error; ··· 395 390 xfs_fsblock_t fsbno; 396 391 int error = 0; 397 392 393 + ASSERT(rs->resv != XFS_AG_RESV_METAFILE); 394 + 398 395 fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno); 399 396 400 397 /* ··· 413 406 * to run xfs_repair. 414 407 */ 415 408 if (crosslinked) { 416 - trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); 409 + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, 410 + *aglenp); 417 411 418 412 rs->force_roll = true; 419 413 ··· 424 416 * records from the refcountbt, which will remove the 425 417 * rmap record as well. 426 418 */ 427 - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); 419 + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, 420 + *aglenp); 428 421 return 0; 429 422 } 430 423 ··· 433 424 *aglenp, rs->oinfo); 434 425 } 435 426 436 - trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); 427 + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); 437 428 438 429 /* 439 430 * Invalidate as many buffers as we can, starting at agbno. 
If this ··· 457 448 if (rs->oinfo == &XFS_RMAP_OINFO_COW) { 458 449 ASSERT(rs->resv == XFS_AG_RESV_NONE); 459 450 460 - xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp); 451 + xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); 461 452 error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL, 462 453 rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD); 463 454 if (error) ··· 684 675 return 0; 685 676 } 686 677 678 + #ifdef CONFIG_XFS_RT 679 + /* 680 + * Figure out the longest run of blocks that we can dispose of with a single 681 + * call. Cross-linked blocks should have their reverse mappings removed, but 682 + * single-owner extents can be freed. Units are rt blocks, not rt extents. 683 + */ 684 + STATIC int 685 + xreap_rgextent_select( 686 + struct xreap_state *rs, 687 + xfs_rgblock_t rgbno, 688 + xfs_rgblock_t rgbno_next, 689 + bool *crosslinked, 690 + xfs_extlen_t *rglenp) 691 + { 692 + struct xfs_scrub *sc = rs->sc; 693 + struct xfs_btree_cur *cur; 694 + xfs_rgblock_t bno = rgbno + 1; 695 + xfs_extlen_t len = 1; 696 + int error; 697 + 698 + /* 699 + * Determine if there are any other rmap records covering the first 700 + * block of this extent. If so, the block is crosslinked. 701 + */ 702 + cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg); 703 + error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo, 704 + crosslinked); 705 + if (error) 706 + goto out_cur; 707 + 708 + /* 709 + * Figure out how many of the subsequent blocks have the same crosslink 710 + * status. 
711 + */ 712 + while (bno < rgbno_next) { 713 + bool also_crosslinked; 714 + 715 + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, 716 + &also_crosslinked); 717 + if (error) 718 + goto out_cur; 719 + 720 + if (*crosslinked != also_crosslinked) 721 + break; 722 + 723 + len++; 724 + bno++; 725 + } 726 + 727 + *rglenp = len; 728 + trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len, 729 + *crosslinked); 730 + out_cur: 731 + xfs_btree_del_cursor(cur, error); 732 + return error; 733 + } 734 + 735 + /* 736 + * Dispose of as much of the beginning of this rtgroup extent as possible. 737 + * The number of blocks disposed of will be returned in @rglenp. 738 + */ 739 + STATIC int 740 + xreap_rgextent_iter( 741 + struct xreap_state *rs, 742 + xfs_rgblock_t rgbno, 743 + xfs_extlen_t *rglenp, 744 + bool crosslinked) 745 + { 746 + struct xfs_scrub *sc = rs->sc; 747 + xfs_rtblock_t rtbno; 748 + int error; 749 + 750 + /* 751 + * The only caller so far is CoW fork repair, so we only know how to 752 + * unlink or free CoW staging extents. Here we don't have to worry 753 + * about invalidating buffers! 754 + */ 755 + if (rs->oinfo != &XFS_RMAP_OINFO_COW) { 756 + ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW); 757 + return -EFSCORRUPTED; 758 + } 759 + ASSERT(rs->resv == XFS_AG_RESV_NONE); 760 + 761 + rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); 762 + 763 + /* 764 + * If there are other rmappings, this block is cross linked and must 765 + * not be freed. Remove the forward and reverse mapping and move on. 766 + */ 767 + if (crosslinked) { 768 + trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno, 769 + *rglenp); 770 + 771 + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); 772 + rs->deferred++; 773 + return 0; 774 + } 775 + 776 + trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); 777 + 778 + /* 779 + * The CoW staging extent is not crosslinked. 
Use deferred work items 780 + * to remove the refcountbt records (which removes the rmap records) 781 + * and free the extent. We're not worried about the system going down 782 + * here because log recovery walks the refcount btree to clean out the 783 + * CoW staging extents. 784 + */ 785 + xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); 786 + error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL, 787 + rs->resv, 788 + XFS_FREE_EXTENT_REALTIME | 789 + XFS_FREE_EXTENT_SKIP_DISCARD); 790 + if (error) 791 + return error; 792 + 793 + rs->deferred++; 794 + return 0; 795 + } 796 + 797 + #define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ 798 + XFS_RTGLOCK_RMAP | \ 799 + XFS_RTGLOCK_REFCOUNT) 800 + 801 + /* 802 + * Break a rt file metadata extent into sub-extents by fate (crosslinked, not 803 + * crosslinked), and dispose of each sub-extent separately. The extent must 804 + * be aligned to a realtime extent. 805 + */ 806 + STATIC int 807 + xreap_rtmeta_extent( 808 + uint64_t rtbno, 809 + uint64_t len, 810 + void *priv) 811 + { 812 + struct xreap_state *rs = priv; 813 + struct xfs_scrub *sc = rs->sc; 814 + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno); 815 + xfs_rgblock_t rgbno_next = rgbno + len; 816 + int error = 0; 817 + 818 + ASSERT(sc->ip != NULL); 819 + ASSERT(!sc->sr.rtg); 820 + 821 + /* 822 + * We're reaping blocks after repairing file metadata, which means that 823 + * we have to init the xchk_rt structure ourselves. 
824 + */ 825 + sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno)); 826 + if (!sc->sr.rtg) 827 + return -EFSCORRUPTED; 828 + 829 + xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL); 830 + 831 + while (rgbno < rgbno_next) { 832 + xfs_extlen_t rglen; 833 + bool crosslinked; 834 + 835 + error = xreap_rgextent_select(rs, rgbno, rgbno_next, 836 + &crosslinked, &rglen); 837 + if (error) 838 + goto out_unlock; 839 + 840 + error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked); 841 + if (error) 842 + goto out_unlock; 843 + 844 + if (xreap_want_defer_finish(rs)) { 845 + error = xfs_defer_finish(&sc->tp); 846 + if (error) 847 + goto out_unlock; 848 + xreap_defer_finish_reset(rs); 849 + } else if (xreap_want_roll(rs)) { 850 + error = xfs_trans_roll_inode(&sc->tp, sc->ip); 851 + if (error) 852 + goto out_unlock; 853 + xreap_reset(rs); 854 + } 855 + 856 + rgbno += rglen; 857 + } 858 + 859 + out_unlock: 860 + xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL); 861 + xfs_rtgroup_put(sc->sr.rtg); 862 + sc->sr.rtg = NULL; 863 + return error; 864 + } 865 + 866 + /* 867 + * Dispose of every block of every rt metadata extent in the bitmap. 868 + * Do not use this to dispose of the mappings in an ondisk inode fork. 869 + */ 870 + int 871 + xrep_reap_rtblocks( 872 + struct xfs_scrub *sc, 873 + struct xrtb_bitmap *bitmap, 874 + const struct xfs_owner_info *oinfo) 875 + { 876 + struct xreap_state rs = { 877 + .sc = sc, 878 + .oinfo = oinfo, 879 + .resv = XFS_AG_RESV_NONE, 880 + }; 881 + int error; 882 + 883 + ASSERT(xfs_has_rmapbt(sc->mp)); 884 + ASSERT(sc->ip != NULL); 885 + 886 + error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); 887 + if (error) 888 + return error; 889 + 890 + if (xreap_dirty(&rs)) 891 + return xrep_defer_finish(sc); 892 + 893 + return 0; 894 + } 895 + #endif /* CONFIG_XFS_RT */ 896 + 897 + /* 898 + * Dispose of every block of an old metadata btree that used to be rooted in a 899 + * metadata directory file. 
900 + */ 901 + int 902 + xrep_reap_metadir_fsblocks( 903 + struct xfs_scrub *sc, 904 + struct xfsb_bitmap *bitmap) 905 + { 906 + /* 907 + * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old 908 + * blocks are no longer mapped by the inode, and inode metadata space 909 + * reservations can only account freed space to the i_nblocks. 910 + */ 911 + struct xfs_owner_info oinfo; 912 + struct xreap_state rs = { 913 + .sc = sc, 914 + .oinfo = &oinfo, 915 + .resv = XFS_AG_RESV_NONE, 916 + }; 917 + int error; 918 + 919 + ASSERT(xfs_has_rmapbt(sc->mp)); 920 + ASSERT(sc->ip != NULL); 921 + ASSERT(xfs_is_metadir_inode(sc->ip)); 922 + 923 + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); 924 + 925 + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); 926 + if (error) 927 + return error; 928 + 929 + if (xreap_dirty(&rs)) 930 + return xrep_defer_finish(sc); 931 + 932 + return 0; 933 + } 934 + 687 935 /* 688 936 * Metadata files are not supposed to share blocks with anything else. 689 937 * If blocks are shared, we remove the reverse mapping (thus reducing the ··· 995 729 } 996 730 997 731 imap->br_blockcount = len; 998 - trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked); 732 + trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len, 733 + *crosslinked); 999 734 out_cur: 1000 735 xfs_btree_del_cursor(cur, error); 1001 736 return error; ··· 1135 868 } 1136 869 1137 870 out: 1138 - trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount); 871 + trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno, 872 + imap->br_blockcount); 1139 873 return 0; 1140 874 } 1141 875 ··· 1163 895 * anybody else who thinks they own the block, even though that 1164 896 * runs the risk of stale buffer warnings in the future. 
1165 897 */ 1166 - trace_xreap_dispose_unmap_extent(sc->sa.pag, 898 + trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), 1167 899 XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), 1168 900 imap->br_blockcount); 1169 901 ··· 1186 918 * by a block starting before the first block of the extent but overlap 1187 919 * anyway. 1188 920 */ 1189 - trace_xreap_dispose_free_extent(sc->sa.pag, 921 + trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), 1190 922 XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock), 1191 923 imap->br_blockcount); 1192 924

+9

fs/xfs/scrub/reap.h

··· 14 14 int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, 15 15 const struct xfs_owner_info *oinfo); 16 16 int xrep_reap_ifork(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork); 17 + int xrep_reap_metadir_fsblocks(struct xfs_scrub *sc, 18 + struct xfsb_bitmap *bitmap); 19 + 20 + #ifdef CONFIG_XFS_RT 21 + int xrep_reap_rtblocks(struct xfs_scrub *sc, struct xrtb_bitmap *bitmap, 22 + const struct xfs_owner_info *oinfo); 23 + #else 24 + # define xrep_reap_rtblocks(...) (-EOPNOTSUPP) 25 + #endif /* CONFIG_XFS_RT */ 17 26 18 27 /* Buffer cache scan context. */ 19 28 struct xrep_bufscan {

+1 -1

fs/xfs/scrub/refcount.c

··· 421 421 if (r1->rc_refcount != r2->rc_refcount) 422 422 return false; 423 423 if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > 424 - MAXREFCEXTLEN) 424 + XFS_REFC_LEN_MAX) 425 425 return false; 426 426 427 427 return true;

+3 -3

fs/xfs/scrub/refcount_repair.c

··· 183 183 if (xchk_should_terminate(sc, &error)) 184 184 return error; 185 185 186 - irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); 186 + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); 187 187 188 188 error = xrep_refc_check_ext(rr->sc, &irec); 189 189 if (error) 190 190 return error; 191 191 192 - trace_xrep_refc_found(sc->sa.pag, &irec); 192 + trace_xrep_refc_found(pag_group(sc->sa.pag), &irec); 193 193 194 194 return xfarray_append(rr->refcount_records, &irec); 195 195 } ··· 422 422 /* 423 423 * Set up a bag to store all the rmap records that we're tracking to 424 424 * generate a reference count record. If the size of the bag exceeds 425 - * MAXREFCOUNT, we clamp rc_refcount. 425 + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. 426 426 */ 427 427 error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); 428 428 if (error)

+197

fs/xfs/scrub/repair.c

··· 37 37 #include "xfs_da_btree.h" 38 38 #include "xfs_attr.h" 39 39 #include "xfs_dir2.h" 40 + #include "xfs_rtrmap_btree.h" 41 + #include "xfs_rtbitmap.h" 42 + #include "xfs_rtgroup.h" 43 + #include "xfs_rtalloc.h" 44 + #include "xfs_metafile.h" 45 + #include "xfs_rtrefcount_btree.h" 40 46 #include "scrub/scrub.h" 41 47 #include "scrub/common.h" 42 48 #include "scrub/trace.h" ··· 68 62 trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); 69 63 70 64 xchk_ag_btcur_free(&sc->sa); 65 + xchk_rtgroup_btcur_free(&sc->sr); 71 66 72 67 /* Repair whatever's broken. */ 73 68 ASSERT(sc->ops->repair); ··· 384 377 385 378 return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); 386 379 } 380 + 381 + #ifdef CONFIG_XFS_RT 382 + /* 383 + * Figure out how many blocks to reserve for a rtgroup repair. We calculate 384 + * the worst case estimate for the number of blocks we'd need to rebuild one of 385 + * any type of per-rtgroup btree. 386 + */ 387 + xfs_extlen_t 388 + xrep_calc_rtgroup_resblks( 389 + struct xfs_scrub *sc) 390 + { 391 + struct xfs_mount *mp = sc->mp; 392 + struct xfs_scrub_metadata *sm = sc->sm; 393 + uint64_t usedlen; 394 + xfs_extlen_t rmapbt_sz = 0; 395 + 396 + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) 397 + return 0; 398 + if (!xfs_has_rtgroups(mp)) { 399 + ASSERT(0); 400 + return -EFSCORRUPTED; 401 + } 402 + 403 + usedlen = xfs_rtbxlen_to_blen(mp, xfs_rtgroup_extents(mp, sm->sm_agno)); 404 + ASSERT(usedlen <= XFS_MAX_RGBLOCKS); 405 + 406 + if (xfs_has_rmapbt(mp)) 407 + rmapbt_sz = xfs_rtrmapbt_calc_size(mp, usedlen); 408 + 409 + trace_xrep_calc_rtgroup_resblks_btsize(mp, sm->sm_agno, usedlen, 410 + rmapbt_sz); 411 + 412 + return rmapbt_sz; 413 + } 414 + #endif /* CONFIG_XFS_RT */ 387 415 388 416 /* 389 417 * Reconstructing per-AG Btrees ··· 996 954 } 997 955 998 956 #ifdef CONFIG_XFS_RT 957 + /* Initialize all the btree cursors for a RT repair. 
*/ 958 + void 959 + xrep_rtgroup_btcur_init( 960 + struct xfs_scrub *sc, 961 + struct xchk_rt *sr) 962 + { 963 + struct xfs_mount *mp = sc->mp; 964 + 965 + ASSERT(sr->rtg != NULL); 966 + 967 + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTRMAPBT && 968 + (sr->rtlock_flags & XFS_RTGLOCK_RMAP) && 969 + xfs_has_rtrmapbt(mp)) 970 + sr->rmap_cur = xfs_rtrmapbt_init_cursor(sc->tp, sr->rtg); 971 + 972 + if (sc->sm->sm_type != XFS_SCRUB_TYPE_RTREFCBT && 973 + (sr->rtlock_flags & XFS_RTGLOCK_REFCOUNT) && 974 + xfs_has_rtreflink(mp)) 975 + sr->refc_cur = xfs_rtrefcountbt_init_cursor(sc->tp, sr->rtg); 976 + } 977 + 999 978 /* 1000 979 * Given a reference to a rtgroup structure, lock rtgroup btree inodes and 1001 980 * create btree cursors. Must only be called to repair a regular rt file. ··· 1035 972 1036 973 /* Grab our own passive reference from the caller's ref. */ 1037 974 sr->rtg = xfs_rtgroup_hold(rtg); 975 + xrep_rtgroup_btcur_init(sc, sr); 976 + return 0; 977 + } 978 + 979 + /* Ensure that all rt blocks in the given range are not marked free. */ 980 + int 981 + xrep_require_rtext_inuse( 982 + struct xfs_scrub *sc, 983 + xfs_rgblock_t rgbno, 984 + xfs_filblks_t len) 985 + { 986 + struct xfs_mount *mp = sc->mp; 987 + xfs_rtxnum_t startrtx; 988 + xfs_rtxnum_t endrtx; 989 + bool is_free = false; 990 + int error; 991 + 992 + startrtx = xfs_rgbno_to_rtx(mp, rgbno); 993 + endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); 994 + 995 + error = xfs_rtalloc_extent_is_free(sc->sr.rtg, sc->tp, startrtx, 996 + endrtx - startrtx + 1, &is_free); 997 + if (error) 998 + return error; 999 + if (is_free) 1000 + return -EFSCORRUPTED; 1001 + 1038 1002 return 0; 1039 1003 } 1040 1004 #endif /* CONFIG_XFS_RT */ ··· 1326 1236 bp->b_error = old_error; 1327 1237 1328 1238 return fa == NULL; 1239 + } 1240 + 1241 + /* Check the sanity of a rmap record for a metadata btree inode. 
*/ 1242 + int 1243 + xrep_check_ino_btree_mapping( 1244 + struct xfs_scrub *sc, 1245 + const struct xfs_rmap_irec *rec) 1246 + { 1247 + enum xbtree_recpacking outcome; 1248 + int error; 1249 + 1250 + /* 1251 + * Metadata btree inodes never have extended attributes, and all blocks 1252 + * should have the bmbt block flag set. 1253 + */ 1254 + if ((rec->rm_flags & XFS_RMAP_ATTR_FORK) || 1255 + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) 1256 + return -EFSCORRUPTED; 1257 + 1258 + /* Make sure the block is within the AG. */ 1259 + if (!xfs_verify_agbext(sc->sa.pag, rec->rm_startblock, 1260 + rec->rm_blockcount)) 1261 + return -EFSCORRUPTED; 1262 + 1263 + /* Make sure this isn't free space. */ 1264 + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, 1265 + rec->rm_blockcount, &outcome); 1266 + if (error) 1267 + return error; 1268 + if (outcome != XBTREE_RECPACKING_EMPTY) 1269 + return -EFSCORRUPTED; 1270 + 1271 + return 0; 1272 + } 1273 + 1274 + /* 1275 + * Reset the block count of the inode being repaired, and adjust the dquot 1276 + * block usage to match. The inode must not have an xattr fork. 1277 + */ 1278 + void 1279 + xrep_inode_set_nblocks( 1280 + struct xfs_scrub *sc, 1281 + int64_t new_blocks) 1282 + { 1283 + int64_t delta = 1284 + new_blocks - sc->ip->i_nblocks; 1285 + 1286 + sc->ip->i_nblocks = new_blocks; 1287 + 1288 + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); 1289 + if (delta != 0) 1290 + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, 1291 + delta); 1292 + } 1293 + 1294 + /* Reset the block reservation for a metadata inode. 
*/ 1295 + int 1296 + xrep_reset_metafile_resv( 1297 + struct xfs_scrub *sc) 1298 + { 1299 + struct xfs_inode *ip = sc->ip; 1300 + int64_t delta; 1301 + int error; 1302 + 1303 + delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked; 1304 + if (delta == 0) 1305 + return 0; 1306 + 1307 + /* 1308 + * Too many blocks have been reserved, transfer some from the incore 1309 + * reservation back to the filesystem. 1310 + */ 1311 + if (delta > 0) { 1312 + int64_t give_back; 1313 + 1314 + give_back = min_t(uint64_t, delta, ip->i_delayed_blks); 1315 + if (give_back > 0) { 1316 + xfs_mod_delalloc(ip, 0, -give_back); 1317 + xfs_add_fdblocks(ip->i_mount, give_back); 1318 + ip->i_delayed_blks -= give_back; 1319 + } 1320 + 1321 + return 0; 1322 + } 1323 + 1324 + /* 1325 + * Not enough reservation; try to take some blocks from the filesystem 1326 + * to the metadata inode. @delta is negative here, so invert the sign. 1327 + */ 1328 + delta = -delta; 1329 + error = xfs_dec_fdblocks(sc->mp, delta, true); 1330 + while (error == -ENOSPC) { 1331 + delta--; 1332 + if (delta == 0) { 1333 + xfs_warn(sc->mp, 1334 + "Insufficient free space to reset space reservation for inode 0x%llx after repair.", 1335 + ip->i_ino); 1336 + return 0; 1337 + } 1338 + error = xfs_dec_fdblocks(sc->mp, delta, true); 1339 + } 1340 + if (error) 1341 + return error; 1342 + 1343 + xfs_mod_delalloc(ip, 0, delta); 1344 + ip->i_delayed_blks += delta; 1345 + return 0; 1329 1346 }

+24

fs/xfs/scrub/repair.h

··· 50 50 51 51 struct xbitmap; 52 52 struct xagb_bitmap; 53 + struct xrgb_bitmap; 53 54 struct xfsb_bitmap; 55 + struct xrtb_bitmap; 54 56 55 57 int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); 56 58 ··· 99 97 int xrep_setup_nlinks(struct xfs_scrub *sc); 100 98 int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *resblks); 101 99 int xrep_setup_dirtree(struct xfs_scrub *sc); 100 + int xrep_setup_rtrmapbt(struct xfs_scrub *sc); 101 + int xrep_setup_rtrefcountbt(struct xfs_scrub *sc); 102 102 103 103 /* Repair setup functions */ 104 104 int xrep_setup_ag_allocbt(struct xfs_scrub *sc); ··· 114 110 #ifdef CONFIG_XFS_RT 115 111 int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg, 116 112 struct xchk_rt *sr, unsigned int rtglock_flags); 113 + void xrep_rtgroup_btcur_init(struct xfs_scrub *sc, struct xchk_rt *sr); 114 + int xrep_require_rtext_inuse(struct xfs_scrub *sc, xfs_rgblock_t rgbno, 115 + xfs_filblks_t len); 116 + xfs_extlen_t xrep_calc_rtgroup_resblks(struct xfs_scrub *sc); 117 117 #else 118 118 # define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS) 119 + # define xrep_calc_rtgroup_resblks(sc) (0) 119 120 #endif /* CONFIG_XFS_RT */ 121 + 122 + int xrep_check_ino_btree_mapping(struct xfs_scrub *sc, 123 + const struct xfs_rmap_irec *rec); 120 124 121 125 /* Metadata revalidators */ 122 126 ··· 159 147 int xrep_rtbitmap(struct xfs_scrub *sc); 160 148 int xrep_rtsummary(struct xfs_scrub *sc); 161 149 int xrep_rgsuperblock(struct xfs_scrub *sc); 150 + int xrep_rtrmapbt(struct xfs_scrub *sc); 151 + int xrep_rtrefcountbt(struct xfs_scrub *sc); 162 152 #else 163 153 # define xrep_rtbitmap xrep_notsupported 164 154 # define xrep_rtsummary xrep_notsupported 165 155 # define xrep_rgsuperblock xrep_notsupported 156 + # define xrep_rtrmapbt xrep_notsupported 157 + # define xrep_rtrefcountbt xrep_notsupported 166 158 #endif /* CONFIG_XFS_RT */ 167 159 168 160 #ifdef CONFIG_XFS_QUOTA ··· 185 169 void xrep_trans_cancel_hook_dummy(void 
**cookiep, struct xfs_trans *tp); 186 170 187 171 bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); 172 + void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks); 173 + int xrep_reset_metafile_resv(struct xfs_scrub *sc); 188 174 189 175 #else 190 176 ··· 209 191 { 210 192 return 0; 211 193 } 194 + 195 + #define xrep_calc_rtgroup_resblks xrep_calc_ag_resblks 212 196 213 197 static inline int 214 198 xrep_reset_perag_resv( ··· 239 219 #define xrep_setup_nlinks xrep_setup_nothing 240 220 #define xrep_setup_dirtree xrep_setup_nothing 241 221 #define xrep_setup_metapath xrep_setup_nothing 222 + #define xrep_setup_rtrmapbt xrep_setup_nothing 223 + #define xrep_setup_rtrefcountbt xrep_setup_nothing 242 224 243 225 #define xrep_setup_inode(sc, imap) ((void)0) 244 226 ··· 278 256 #define xrep_dirtree xrep_notsupported 279 257 #define xrep_metapath xrep_notsupported 280 258 #define xrep_rgsuperblock xrep_notsupported 259 + #define xrep_rtrmapbt xrep_notsupported 260 + #define xrep_rtrefcountbt xrep_notsupported 281 261 282 262 #endif /* CONFIG_XFS_ONLINE_REPAIR */ 283 263

+37

fs/xfs/scrub/rgb_bitmap.h

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef __XFS_SCRUB_RGB_BITMAP_H__ 7 + #define __XFS_SCRUB_RGB_BITMAP_H__ 8 + 9 + /* Bitmaps, but type-checked for xfs_rgblock_t */ 10 + 11 + struct xrgb_bitmap { 12 + struct xbitmap32 rgbitmap; 13 + }; 14 + 15 + static inline void xrgb_bitmap_init(struct xrgb_bitmap *bitmap) 16 + { 17 + xbitmap32_init(&bitmap->rgbitmap); 18 + } 19 + 20 + static inline void xrgb_bitmap_destroy(struct xrgb_bitmap *bitmap) 21 + { 22 + xbitmap32_destroy(&bitmap->rgbitmap); 23 + } 24 + 25 + static inline int xrgb_bitmap_set(struct xrgb_bitmap *bitmap, 26 + xfs_rgblock_t start, xfs_extlen_t len) 27 + { 28 + return xbitmap32_set(&bitmap->rgbitmap, start, len); 29 + } 30 + 31 + static inline int xrgb_bitmap_walk(struct xrgb_bitmap *bitmap, 32 + xbitmap32_walk_fn fn, void *priv) 33 + { 34 + return xbitmap32_walk(&bitmap->rgbitmap, fn, priv); 35 + } 36 + 37 + #endif /* __XFS_SCRUB_RGB_BITMAP_H__ */

+5 -1

fs/xfs/scrub/rgsuper.c

··· 13 13 #include "xfs_log_format.h" 14 14 #include "xfs_trans.h" 15 15 #include "xfs_sb.h" 16 + #include "xfs_rmap.h" 16 17 #include "scrub/scrub.h" 17 18 #include "scrub/common.h" 18 19 #include "scrub/repair.h" ··· 35 34 return; 36 35 37 36 xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, 0), 1); 37 + xchk_xref_is_only_rt_owned_by(sc, 0, 1, &XFS_RMAP_OINFO_FS); 38 38 } 39 39 40 40 int ··· 63 61 if (!xchk_xref_process_error(sc, 0, 0, &error)) 64 62 return error; 65 63 66 - xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP_SHARED); 64 + error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP_SHARED); 65 + if (error) 66 + return error; 67 67 68 68 /* 69 69 * Since we already validated the rt superblock at mount time, we don't

+80 -11

fs/xfs/scrub/rmap_repair.c

··· 31 31 #include "xfs_refcount.h" 32 32 #include "xfs_refcount_btree.h" 33 33 #include "xfs_ag.h" 34 + #include "xfs_rtrmap_btree.h" 35 + #include "xfs_rtgroup.h" 36 + #include "xfs_rtrefcount_btree.h" 34 37 #include "scrub/xfs_scrub.h" 35 38 #include "scrub/scrub.h" 36 39 #include "scrub/common.h" ··· 502 499 return xrep_rmap_stash_accumulated(rf); 503 500 } 504 501 502 + static int 503 + xrep_rmap_scan_meta_btree( 504 + struct xrep_rmap_ifork *rf, 505 + struct xfs_inode *ip) 506 + { 507 + struct xfs_scrub *sc = rf->rr->sc; 508 + struct xfs_rtgroup *rtg = NULL; 509 + struct xfs_btree_cur *cur = NULL; 510 + enum xfs_rtg_inodes type; 511 + int error; 512 + 513 + if (rf->whichfork != XFS_DATA_FORK) 514 + return -EFSCORRUPTED; 515 + 516 + switch (ip->i_metatype) { 517 + case XFS_METAFILE_RTRMAP: 518 + type = XFS_RTGI_RMAP; 519 + break; 520 + case XFS_METAFILE_RTREFCOUNT: 521 + type = XFS_RTGI_REFCOUNT; 522 + break; 523 + default: 524 + ASSERT(0); 525 + return -EFSCORRUPTED; 526 + } 527 + 528 + while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) { 529 + if (ip == rtg->rtg_inodes[type]) 530 + goto found; 531 + } 532 + 533 + /* 534 + * We should never find an rt metadata btree inode that isn't 535 + * associated with an rtgroup yet has ondisk blocks allocated to it. 536 + */ 537 + if (ip->i_nblocks) { 538 + ASSERT(0); 539 + return -EFSCORRUPTED; 540 + } 541 + 542 + return 0; 543 + 544 + found: 545 + switch (ip->i_metatype) { 546 + case XFS_METAFILE_RTRMAP: 547 + cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg); 548 + break; 549 + case XFS_METAFILE_RTREFCOUNT: 550 + cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg); 551 + break; 552 + default: 553 + ASSERT(0); 554 + error = -EFSCORRUPTED; 555 + goto out_rtg; 556 + } 557 + 558 + error = xrep_rmap_scan_iroot_btree(rf, cur); 559 + xfs_btree_del_cursor(cur, error); 560 + out_rtg: 561 + xfs_rtgroup_rele(rtg); 562 + return error; 563 + } 564 + 505 565 /* Find all the extents from a given AG in an inode fork. 
*/ 506 566 STATIC int 507 567 xrep_rmap_scan_ifork( ··· 578 512 .whichfork = whichfork, 579 513 }; 580 514 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 515 + bool mappings_done; 581 516 int error = 0; 582 517 583 518 if (!ifp) 584 519 return 0; 585 520 586 - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { 587 - bool mappings_done; 588 - 521 + switch (ifp->if_format) { 522 + case XFS_DINODE_FMT_BTREE: 589 523 /* 590 524 * Scan the bmap btree for data device mappings. This includes 591 525 * the btree blocks themselves, even if this is a realtime ··· 594 528 error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done); 595 529 if (error || mappings_done) 596 530 return error; 597 - } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { 598 - return 0; 531 + fallthrough; 532 + case XFS_DINODE_FMT_EXTENTS: 533 + /* Scan incore extent cache if this isn't a realtime file. */ 534 + if (xfs_ifork_is_realtime(ip, whichfork)) 535 + return 0; 536 + 537 + return xrep_rmap_scan_iext(&rf, ifp); 538 + case XFS_DINODE_FMT_META_BTREE: 539 + return xrep_rmap_scan_meta_btree(&rf, ip); 599 540 } 600 541 601 - /* Scan incore extent cache if this isn't a realtime file. */ 602 - if (xfs_ifork_is_realtime(ip, whichfork)) 603 - return 0; 604 - 605 - return xrep_rmap_scan_iext(&rf, ifp); 542 + return 0; 606 543 } 607 544 608 545 /* ··· 1621 1552 if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) 1622 1553 goto out_unlock; 1623 1554 1624 - trace_xrep_rmap_live_update(rr->sc->sa.pag, action, p); 1555 + trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p); 1625 1556 1626 1557 error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); 1627 1558 if (error)

+37

fs/xfs/scrub/rtb_bitmap.h

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef __XFS_SCRUB_RTB_BITMAP_H__ 7 + #define __XFS_SCRUB_RTB_BITMAP_H__ 8 + 9 + /* Bitmaps, but type-checked for xfs_rtblock_t */ 10 + 11 + struct xrtb_bitmap { 12 + struct xbitmap64 rtbitmap; 13 + }; 14 + 15 + static inline void xrtb_bitmap_init(struct xrtb_bitmap *bitmap) 16 + { 17 + xbitmap64_init(&bitmap->rtbitmap); 18 + } 19 + 20 + static inline void xrtb_bitmap_destroy(struct xrtb_bitmap *bitmap) 21 + { 22 + xbitmap64_destroy(&bitmap->rtbitmap); 23 + } 24 + 25 + static inline int xrtb_bitmap_set(struct xrtb_bitmap *bitmap, 26 + xfs_rtblock_t start, xfs_filblks_t len) 27 + { 28 + return xbitmap64_set(&bitmap->rtbitmap, start, len); 29 + } 30 + 31 + static inline int xrtb_bitmap_walk(struct xrtb_bitmap *bitmap, 32 + xbitmap64_walk_fn fn, void *priv) 33 + { 34 + return xbitmap64_walk(&bitmap->rtbitmap, fn, priv); 35 + } 36 + 37 + #endif /* __XFS_SCRUB_RTB_BITMAP_H__ */

+67 -10

fs/xfs/scrub/rtbitmap.c

··· 9 9 #include "xfs_format.h" 10 10 #include "xfs_trans_resv.h" 11 11 #include "xfs_mount.h" 12 + #include "xfs_btree.h" 12 13 #include "xfs_log_format.h" 13 14 #include "xfs_trans.h" 14 15 #include "xfs_rtbitmap.h" 15 16 #include "xfs_inode.h" 16 17 #include "xfs_bmap.h" 17 18 #include "xfs_bit.h" 19 + #include "xfs_rtgroup.h" 18 20 #include "xfs_sb.h" 21 + #include "xfs_rmap.h" 22 + #include "xfs_rtrmap_btree.h" 23 + #include "xfs_exchmaps.h" 19 24 #include "scrub/scrub.h" 20 25 #include "scrub/common.h" 21 26 #include "scrub/repair.h" 27 + #include "scrub/tempexch.h" 22 28 #include "scrub/rtbitmap.h" 29 + #include "scrub/btree.h" 23 30 24 31 /* Set us up with the realtime metadata locked. */ 25 32 int ··· 37 30 struct xchk_rtbitmap *rtb; 38 31 int error; 39 32 40 - rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); 33 + if (xchk_need_intent_drain(sc)) 34 + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); 35 + 36 + rtb = kzalloc(struct_size(rtb, words, xchk_rtbitmap_wordcnt(sc)), 37 + XCHK_GFP_FLAGS); 41 38 if (!rtb) 42 39 return -ENOMEM; 43 40 sc->buf = rtb; 41 + rtb->sc = sc; 44 42 45 43 error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); 46 44 if (error) ··· 61 49 if (error) 62 50 return error; 63 51 64 - error = xchk_install_live_inode(sc, 65 - sc->sr.rtg->rtg_inodes[XFS_RTGI_BITMAP]); 52 + error = xchk_install_live_inode(sc, rtg_bitmap(sc->sr.rtg)); 66 53 if (error) 67 54 return error; 68 55 69 56 error = xchk_ino_dqattach(sc); 57 + if (error) 58 + return error; 59 + 60 + error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); 70 61 if (error) 71 62 return error; 72 63 ··· 78 63 * trying to expand the bitmap or change the size of the rt volume. 79 64 * Hence it is safe to compute and check the geometry values. 
80 65 */ 81 - xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP); 82 66 if (mp->m_sb.sb_rblocks) { 83 67 rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); 84 68 rtb->rextslog = xfs_compute_rextslog(rtb->rextents); ··· 87 73 return 0; 88 74 } 89 75 90 - /* Realtime bitmap. */ 76 + /* Per-rtgroup bitmap contents. */ 77 + 78 + /* Cross-reference rtbitmap entries with other metadata. */ 79 + STATIC void 80 + xchk_rtbitmap_xref( 81 + struct xchk_rtbitmap *rtb, 82 + xfs_rtblock_t startblock, 83 + xfs_rtblock_t blockcount) 84 + { 85 + struct xfs_scrub *sc = rtb->sc; 86 + xfs_rgblock_t rgbno = xfs_rtb_to_rgbno(sc->mp, startblock); 87 + 88 + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 89 + return; 90 + if (!sc->sr.rmap_cur) 91 + return; 92 + 93 + xchk_xref_has_no_rt_owner(sc, rgbno, blockcount); 94 + xchk_xref_is_not_rt_shared(sc, rgbno, blockcount); 95 + xchk_xref_is_not_rt_cow_staging(sc, rgbno, blockcount); 96 + 97 + if (rtb->next_free_rgbno < rgbno) 98 + xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, 99 + rgbno - rtb->next_free_rgbno); 100 + rtb->next_free_rgbno = rgbno + blockcount; 101 + } 91 102 92 103 /* Scrub a free extent record from the realtime bitmap. */ 93 104 STATIC int ··· 122 83 const struct xfs_rtalloc_rec *rec, 123 84 void *priv) 124 85 { 125 - struct xfs_scrub *sc = priv; 86 + struct xchk_rtbitmap *rtb = priv; 87 + struct xfs_scrub *sc = rtb->sc; 126 88 xfs_rtblock_t startblock; 127 89 xfs_filblks_t blockcount; 128 90 ··· 132 92 133 93 if (!xfs_verify_rtbext(rtg_mount(rtg), startblock, blockcount)) 134 94 xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); 95 + 96 + xchk_rtbitmap_xref(rtb, startblock, blockcount); 97 + 98 + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 99 + return -ECANCELED; 100 + 135 101 return 0; 136 102 } 137 103 ··· 185 139 return error; 186 140 } 187 141 188 - /* Scrub the realtime bitmap. */ 142 + /* Scrub this group's realtime bitmap. 
*/ 189 143 int 190 144 xchk_rtbitmap( 191 145 struct xfs_scrub *sc) 192 146 { 193 147 struct xfs_mount *mp = sc->mp; 194 148 struct xfs_rtgroup *rtg = sc->sr.rtg; 195 - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; 149 + struct xfs_inode *rbmip = rtg_bitmap(rtg); 196 150 struct xchk_rtbitmap *rtb = sc->buf; 151 + xfs_rgblock_t last_rgbno; 197 152 int error; 198 153 199 154 /* Is sb_rextents correct? */ ··· 247 200 if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 248 201 return error; 249 202 250 - error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, sc); 203 + rtb->next_free_rgbno = 0; 204 + error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, rtb); 251 205 if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) 252 206 return error; 253 207 208 + /* 209 + * Check that there are rmappings for all rt extents between the end of 210 + * the last free extent we saw and the last possible extent in the rt 211 + * group. 212 + */ 213 + last_rgbno = rtg->rtg_extents * mp->m_sb.sb_rextsize - 1; 214 + if (rtb->next_free_rgbno < last_rgbno) 215 + xchk_xref_has_rt_owner(sc, rtb->next_free_rgbno, 216 + last_rgbno - rtb->next_free_rgbno); 254 217 return 0; 255 218 } 256 219 ··· 272 215 xfs_extlen_t len) 273 216 { 274 217 struct xfs_rtgroup *rtg = sc->sr.rtg; 275 - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; 218 + struct xfs_inode *rbmip = rtg_bitmap(rtg); 276 219 xfs_rtxnum_t startext; 277 220 xfs_rtxnum_t endext; 278 221 bool is_free;

+55

fs/xfs/scrub/rtbitmap.h

··· 6 6 #ifndef __XFS_SCRUB_RTBITMAP_H__ 7 7 #define __XFS_SCRUB_RTBITMAP_H__ 8 8 9 + /* 10 + * We use an xfile to construct new bitmap blocks for the portion of the 11 + * rtbitmap file that we're replacing. Whereas the ondisk bitmap must be 12 + * accessed through the buffer cache, the xfile bitmap supports direct 13 + * word-level accesses. Therefore, we create a small abstraction for linear 14 + * access. 15 + */ 16 + typedef unsigned long long xrep_wordoff_t; 17 + typedef unsigned int xrep_wordcnt_t; 18 + 19 + /* Mask to round an rtx down to the nearest bitmap word. */ 20 + #define XREP_RTBMP_WORDMASK ((1ULL << XFS_NBWORDLOG) - 1) 21 + 22 + 9 23 struct xchk_rtbitmap { 24 + struct xfs_scrub *sc; 25 + 10 26 uint64_t rextents; 11 27 uint64_t rbmblocks; 12 28 unsigned int rextslog; 13 29 unsigned int resblks; 30 + 31 + /* The next free rt group block number that we expect to see. */ 32 + xfs_rgblock_t next_free_rgbno; 33 + 34 + #ifdef CONFIG_XFS_ONLINE_REPAIR 35 + /* stuff for staging a new bitmap */ 36 + struct xfs_rtalloc_args args; 37 + struct xrep_tempexch tempexch; 38 + #endif 39 + 40 + /* The next rtgroup block we expect to see during our rtrmapbt walk. */ 41 + xfs_rgblock_t next_rgbno; 42 + 43 + /* rtgroup lock flags */ 44 + unsigned int rtglock_flags; 45 + 46 + /* rtword position of xfile as we write buffers to disk. */ 47 + xrep_wordoff_t prep_wordoff; 48 + 49 + /* In-Memory rtbitmap for repair. */ 50 + union xfs_rtword_raw words[]; 14 51 }; 15 52 16 53 #ifdef CONFIG_XFS_ONLINE_REPAIR 17 54 int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); 55 + 56 + /* 57 + * How big should the words[] buffer be? 58 + * 59 + * For repairs, we want a full fsblock worth of space so that we can memcpy a 60 + * buffer full of 1s into the xfile bitmap. The xfile bitmap doesn't have 61 + * rtbitmap block headers, so we don't use blockwsize. Scrub doesn't use the 62 + * words buffer at all. 
63 + */ 64 + static inline unsigned int 65 + xchk_rtbitmap_wordcnt( 66 + struct xfs_scrub *sc) 67 + { 68 + if (xchk_could_repair(sc)) 69 + return sc->mp->m_sb.sb_blocksize >> XFS_WORDLOG; 70 + return 0; 71 + } 18 72 #else 19 73 # define xrep_setup_rtbitmap(sc, rtb) (0) 74 + # define xchk_rtbitmap_wordcnt(sc) (0) 20 75 #endif /* CONFIG_XFS_ONLINE_REPAIR */ 21 76 22 77 #endif /* __XFS_SCRUB_RTBITMAP_H__ */

+443 -8

fs/xfs/scrub/rtbitmap_repair.c

··· 12 12 #include "xfs_btree.h" 13 13 #include "xfs_log_format.h" 14 14 #include "xfs_trans.h" 15 + #include "xfs_rtalloc.h" 15 16 #include "xfs_inode.h" 16 17 #include "xfs_bit.h" 17 18 #include "xfs_bmap.h" 18 19 #include "xfs_bmap_btree.h" 20 + #include "xfs_rmap.h" 21 + #include "xfs_rtrmap_btree.h" 22 + #include "xfs_exchmaps.h" 23 + #include "xfs_rtbitmap.h" 24 + #include "xfs_rtgroup.h" 25 + #include "xfs_extent_busy.h" 26 + #include "xfs_refcount.h" 19 27 #include "scrub/scrub.h" 20 28 #include "scrub/common.h" 21 29 #include "scrub/trace.h" 22 30 #include "scrub/repair.h" 23 31 #include "scrub/xfile.h" 32 + #include "scrub/tempfile.h" 33 + #include "scrub/tempexch.h" 34 + #include "scrub/reap.h" 24 35 #include "scrub/rtbitmap.h" 25 36 26 - /* Set up to repair the realtime bitmap file metadata. */ 37 + /* rt bitmap content repairs */ 38 + 39 + /* Set up to repair the realtime bitmap for this group. */ 27 40 int 28 41 xrep_setup_rtbitmap( 29 42 struct xfs_scrub *sc, 30 43 struct xchk_rtbitmap *rtb) 31 44 { 32 45 struct xfs_mount *mp = sc->mp; 33 - unsigned long long blocks = 0; 46 + char *descr; 47 + unsigned long long blocks = mp->m_sb.sb_rbmblocks; 48 + int error; 49 + 50 + error = xrep_tempfile_create(sc, S_IFREG); 51 + if (error) 52 + return error; 53 + 54 + /* Create an xfile to hold our reconstructed bitmap. */ 55 + descr = xchk_xfile_rtgroup_descr(sc, "bitmap file"); 56 + error = xfile_create(descr, blocks * mp->m_sb.sb_blocksize, &sc->xfile); 57 + kfree(descr); 58 + if (error) 59 + return error; 34 60 35 61 /* 36 - * Reserve enough blocks to write out a completely new bmbt for a 37 - * maximally fragmented bitmap file. We do not hold the rtbitmap 38 - * ILOCK yet, so this is entirely speculative. 62 + * Reserve enough blocks to write out a completely new bitmap file, 63 + * plus twice as many blocks as we would need if we can only allocate 64 + * one block per data fork mapping. 
This should cover the 65 + * preallocation of the temporary file and exchanging the extent 66 + * mappings. 67 + * 68 + * We cannot use xfs_exchmaps_estimate because we have not yet 69 + * constructed the replacement bitmap and therefore do not know how 70 + * many extents it will use. By the time we do, we will have a dirty 71 + * transaction (which we cannot drop because we cannot drop the 72 + * rtbitmap ILOCK) and cannot ask for more reservation. 39 73 */ 40 - blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); 74 + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; 41 75 if (blocks > UINT_MAX) 42 76 return -EOPNOTSUPP; 43 77 44 78 rtb->resblks += blocks; 79 + return 0; 80 + } 81 + 82 + static inline xrep_wordoff_t 83 + rtx_to_wordoff( 84 + struct xfs_mount *mp, 85 + xfs_rtxnum_t rtx) 86 + { 87 + return rtx >> XFS_NBWORDLOG; 88 + } 89 + 90 + static inline xrep_wordcnt_t 91 + rtxlen_to_wordcnt( 92 + xfs_rtxlen_t rtxlen) 93 + { 94 + return rtxlen >> XFS_NBWORDLOG; 95 + } 96 + 97 + /* Helper functions to record rtwords in an xfile. 
*/ 98 + 99 + static inline int 100 + xfbmp_load( 101 + struct xchk_rtbitmap *rtb, 102 + xrep_wordoff_t wordoff, 103 + xfs_rtword_t *word) 104 + { 105 + union xfs_rtword_raw urk; 106 + int error; 107 + 108 + ASSERT(xfs_has_rtgroups(rtb->sc->mp)); 109 + 110 + error = xfile_load(rtb->sc->xfile, &urk, 111 + sizeof(union xfs_rtword_raw), 112 + wordoff << XFS_WORDLOG); 113 + if (error) 114 + return error; 115 + 116 + *word = be32_to_cpu(urk.rtg); 117 + return 0; 118 + } 119 + 120 + static inline int 121 + xfbmp_store( 122 + struct xchk_rtbitmap *rtb, 123 + xrep_wordoff_t wordoff, 124 + const xfs_rtword_t word) 125 + { 126 + union xfs_rtword_raw urk; 127 + 128 + ASSERT(xfs_has_rtgroups(rtb->sc->mp)); 129 + 130 + urk.rtg = cpu_to_be32(word); 131 + return xfile_store(rtb->sc->xfile, &urk, 132 + sizeof(union xfs_rtword_raw), 133 + wordoff << XFS_WORDLOG); 134 + } 135 + 136 + static inline int 137 + xfbmp_copyin( 138 + struct xchk_rtbitmap *rtb, 139 + xrep_wordoff_t wordoff, 140 + const union xfs_rtword_raw *word, 141 + xrep_wordcnt_t nr_words) 142 + { 143 + return xfile_store(rtb->sc->xfile, word, nr_words << XFS_WORDLOG, 144 + wordoff << XFS_WORDLOG); 145 + } 146 + 147 + static inline int 148 + xfbmp_copyout( 149 + struct xchk_rtbitmap *rtb, 150 + xrep_wordoff_t wordoff, 151 + union xfs_rtword_raw *word, 152 + xrep_wordcnt_t nr_words) 153 + { 154 + return xfile_load(rtb->sc->xfile, word, nr_words << XFS_WORDLOG, 155 + wordoff << XFS_WORDLOG); 156 + } 157 + 158 + /* Perform a logical OR operation on an rtword in the incore bitmap. 
*/ 159 + static int 160 + xrep_rtbitmap_or( 161 + struct xchk_rtbitmap *rtb, 162 + xrep_wordoff_t wordoff, 163 + xfs_rtword_t mask) 164 + { 165 + xfs_rtword_t word; 166 + int error; 167 + 168 + error = xfbmp_load(rtb, wordoff, &word); 169 + if (error) 170 + return error; 171 + 172 + trace_xrep_rtbitmap_or(rtb->sc->mp, wordoff, mask, word); 173 + 174 + return xfbmp_store(rtb, wordoff, word | mask); 175 + } 176 + 177 + /* 178 + * Mark as free every rt extent between the next rt block we expected to see 179 + * in the rtrmap records and the given rt block. 180 + */ 181 + STATIC int 182 + xrep_rtbitmap_mark_free( 183 + struct xchk_rtbitmap *rtb, 184 + xfs_rgblock_t rgbno) 185 + { 186 + struct xfs_mount *mp = rtb->sc->mp; 187 + struct xchk_rt *sr = &rtb->sc->sr; 188 + struct xfs_rtgroup *rtg = sr->rtg; 189 + xfs_rtxnum_t startrtx; 190 + xfs_rtxnum_t nextrtx; 191 + xrep_wordoff_t wordoff, nextwordoff; 192 + unsigned int bit; 193 + unsigned int bufwsize; 194 + xfs_extlen_t mod; 195 + xfs_rtword_t mask; 196 + enum xbtree_recpacking outcome; 197 + int error; 198 + 199 + if (!xfs_verify_rgbext(rtg, rtb->next_rgbno, rgbno - rtb->next_rgbno)) 200 + return -EFSCORRUPTED; 201 + 202 + /* 203 + * Convert rt blocks to rt extents. The block range we find must be 204 + * aligned to an rtextent boundary on both ends. 205 + */ 206 + startrtx = xfs_rgbno_to_rtx(mp, rtb->next_rgbno); 207 + mod = xfs_rgbno_to_rtxoff(mp, rtb->next_rgbno); 208 + if (mod) 209 + return -EFSCORRUPTED; 210 + 211 + nextrtx = xfs_rgbno_to_rtx(mp, rgbno - 1) + 1; 212 + mod = xfs_rgbno_to_rtxoff(mp, rgbno - 1); 213 + if (mod != mp->m_sb.sb_rextsize - 1) 214 + return -EFSCORRUPTED; 215 + 216 + /* Must not be shared or CoW staging.
*/ 217 + if (sr->refc_cur) { 218 + error = xfs_refcount_has_records(sr->refc_cur, 219 + XFS_REFC_DOMAIN_SHARED, rtb->next_rgbno, 220 + rgbno - rtb->next_rgbno, &outcome); 221 + if (error) 222 + return error; 223 + if (outcome != XBTREE_RECPACKING_EMPTY) 224 + return -EFSCORRUPTED; 225 + 226 + error = xfs_refcount_has_records(sr->refc_cur, 227 + XFS_REFC_DOMAIN_COW, rtb->next_rgbno, 228 + rgbno - rtb->next_rgbno, &outcome); 229 + if (error) 230 + return error; 231 + if (outcome != XBTREE_RECPACKING_EMPTY) 232 + return -EFSCORRUPTED; 233 + } 234 + 235 + trace_xrep_rtbitmap_record_free(mp, startrtx, nextrtx - 1); 236 + 237 + /* Set bits as needed to round startrtx up to the nearest word. */ 238 + bit = startrtx & XREP_RTBMP_WORDMASK; 239 + if (bit) { 240 + xfs_rtblock_t len = nextrtx - startrtx; 241 + unsigned int lastbit; 242 + 243 + lastbit = min(bit + len, XFS_NBWORD); 244 + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; 245 + 246 + error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, startrtx), 247 + mask); 248 + if (error || lastbit - bit == len) 249 + return error; 250 + startrtx += XFS_NBWORD - bit; 251 + } 252 + 253 + /* Set bits as needed to round nextrtx down to the nearest word. */ 254 + bit = nextrtx & XREP_RTBMP_WORDMASK; 255 + if (bit) { 256 + mask = ((xfs_rtword_t)1 << bit) - 1; 257 + 258 + error = xrep_rtbitmap_or(rtb, rtx_to_wordoff(mp, nextrtx), 259 + mask); 260 + if (error || startrtx + bit == nextrtx) 261 + return error; 262 + nextrtx -= bit; 263 + } 264 + 265 + trace_xrep_rtbitmap_record_free_bulk(mp, startrtx, nextrtx - 1); 266 + 267 + /* Set all the words in between, up to a whole fs block at once. 
*/ 268 + wordoff = rtx_to_wordoff(mp, startrtx); 269 + nextwordoff = rtx_to_wordoff(mp, nextrtx); 270 + bufwsize = mp->m_sb.sb_blocksize >> XFS_WORDLOG; 271 + 272 + while (wordoff < nextwordoff) { 273 + xrep_wordoff_t rem; 274 + xrep_wordcnt_t wordcnt; 275 + 276 + wordcnt = min_t(xrep_wordcnt_t, nextwordoff - wordoff, 277 + bufwsize); 278 + 279 + /* 280 + * Try to keep us aligned to the rtwords buffer to reduce the 281 + * number of xfile writes. 282 + */ 283 + rem = wordoff & (bufwsize - 1); 284 + if (rem) 285 + wordcnt = min_t(xrep_wordcnt_t, wordcnt, 286 + bufwsize - rem); 287 + 288 + error = xfbmp_copyin(rtb, wordoff, rtb->words, wordcnt); 289 + if (error) 290 + return error; 291 + 292 + wordoff += wordcnt; 293 + } 294 + 295 + return 0; 296 + } 297 + 298 + /* Set free space in the rtbitmap based on rtrmapbt records. */ 299 + STATIC int 300 + xrep_rtbitmap_walk_rtrmap( 301 + struct xfs_btree_cur *cur, 302 + const struct xfs_rmap_irec *rec, 303 + void *priv) 304 + { 305 + struct xchk_rtbitmap *rtb = priv; 306 + int error = 0; 307 + 308 + if (xchk_should_terminate(rtb->sc, &error)) 309 + return error; 310 + 311 + if (rtb->next_rgbno < rec->rm_startblock) { 312 + error = xrep_rtbitmap_mark_free(rtb, rec->rm_startblock); 313 + if (error) 314 + return error; 315 + } 316 + 317 + rtb->next_rgbno = max(rtb->next_rgbno, 318 + rec->rm_startblock + rec->rm_blockcount); 319 + return 0; 320 + } 321 + 322 + /* 323 + * Walk the rtrmapbt to find all the gaps between records, and mark the gaps 324 + * in the realtime bitmap that we're computing. 325 + */ 326 + STATIC int 327 + xrep_rtbitmap_find_freespace( 328 + struct xchk_rtbitmap *rtb) 329 + { 330 + struct xfs_scrub *sc = rtb->sc; 331 + struct xfs_mount *mp = sc->mp; 332 + struct xfs_rtgroup *rtg = sc->sr.rtg; 333 + uint64_t blockcount; 334 + int error; 335 + 336 + /* Prepare a buffer of ones so that we can accelerate bulk setting. 
*/ 337 + memset(rtb->words, 0xFF, mp->m_sb.sb_blocksize); 338 + 339 + xrep_rtgroup_btcur_init(sc, &sc->sr); 340 + error = xfs_rmap_query_all(sc->sr.rmap_cur, xrep_rtbitmap_walk_rtrmap, 341 + rtb); 342 + if (error) 343 + goto out; 344 + 345 + /* 346 + * Mark as free every possible rt extent from the last one we saw to 347 + * the end of the rt group. 348 + */ 349 + blockcount = rtg->rtg_extents * mp->m_sb.sb_rextsize; 350 + if (rtb->next_rgbno < blockcount) { 351 + error = xrep_rtbitmap_mark_free(rtb, blockcount); 352 + if (error) 353 + goto out; 354 + } 355 + 356 + out: 357 + xchk_rtgroup_btcur_free(&sc->sr); 358 + return error; 359 + } 360 + 361 + static int 362 + xrep_rtbitmap_prep_buf( 363 + struct xfs_scrub *sc, 364 + struct xfs_buf *bp, 365 + void *data) 366 + { 367 + struct xchk_rtbitmap *rtb = data; 368 + struct xfs_mount *mp = sc->mp; 369 + union xfs_rtword_raw *ondisk; 370 + int error; 371 + 372 + rtb->args.mp = sc->mp; 373 + rtb->args.tp = sc->tp; 374 + rtb->args.rbmbp = bp; 375 + ondisk = xfs_rbmblock_wordptr(&rtb->args, 0); 376 + rtb->args.rbmbp = NULL; 377 + 378 + error = xfbmp_copyout(rtb, rtb->prep_wordoff, ondisk, 379 + mp->m_blockwsize); 380 + if (error) 381 + return error; 382 + 383 + if (xfs_has_rtgroups(sc->mp)) { 384 + struct xfs_rtbuf_blkinfo *hdr = bp->b_addr; 385 + 386 + hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC); 387 + hdr->rt_owner = cpu_to_be64(sc->ip->i_ino); 388 + hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp)); 389 + hdr->rt_lsn = 0; 390 + uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid); 391 + bp->b_ops = &xfs_rtbitmap_buf_ops; 392 + } else { 393 + bp->b_ops = &xfs_rtbuf_ops; 394 + } 395 + 396 + rtb->prep_wordoff += mp->m_blockwsize; 397 + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTBITMAP_BUF); 45 398 return 0; 46 399 } 47 400 ··· 513 160 { 514 161 struct xchk_rtbitmap *rtb = sc->buf; 515 162 struct xfs_mount *mp = sc->mp; 163 + struct xfs_group *xg = rtg_group(sc->sr.rtg); 516 164 unsigned long long blocks = 0; 165 + 
unsigned int busy_gen; 517 166 int error; 167 + 168 + /* We require the realtime rmapbt to rebuild anything. */ 169 + if (!xfs_has_rtrmapbt(sc->mp)) 170 + return -EOPNOTSUPP; 171 + /* We require atomic file exchange range to rebuild anything. */ 172 + if (!xfs_has_exchange_range(sc->mp)) 173 + return -EOPNOTSUPP; 518 174 519 175 /* Impossibly large rtbitmap means we can't touch the filesystem. */ 520 176 if (rtb->rbmblocks > U32_MAX) ··· 557 195 if (error) 558 196 return error; 559 197 560 - /* Fix inconsistent bitmap geometry */ 561 - return xrep_rtbitmap_geometry(sc, rtb); 198 + /* 199 + * Fix inconsistent bitmap geometry. This function returns with a 200 + * clean scrub transaction. 201 + */ 202 + error = xrep_rtbitmap_geometry(sc, rtb); 203 + if (error) 204 + return error; 205 + 206 + /* 207 + * Make sure the busy extent list is clear because we can't put extents 208 + * on there twice. 209 + */ 210 + if (!xfs_extent_busy_list_empty(xg, &busy_gen)) { 211 + error = xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); 212 + if (error) 213 + return error; 214 + } 215 + 216 + /* 217 + * Generate the new rtbitmap data. We don't need the rtbmp information 218 + * once this call is finished. 219 + */ 220 + error = xrep_rtbitmap_find_freespace(rtb); 221 + if (error) 222 + return error; 223 + 224 + /* 225 + * Try to take ILOCK_EXCL of the temporary file. We had better be the 226 + * only ones holding onto this inode, but we can't block while holding 227 + * the rtbitmap file's ILOCK_EXCL. 228 + */ 229 + while (!xrep_tempfile_ilock_nowait(sc)) { 230 + if (xchk_should_terminate(sc, &error)) 231 + return error; 232 + delay(1); 233 + } 234 + 235 + /* 236 + * Make sure we have space allocated for the part of the bitmap 237 + * file that corresponds to this group. We already joined sc->ip. 
238 + */ 239 + xfs_trans_ijoin(sc->tp, sc->tempip, 0); 240 + error = xrep_tempfile_prealloc(sc, 0, rtb->rbmblocks); 241 + if (error) 242 + return error; 243 + 244 + /* Last chance to abort before we start committing fixes. */ 245 + if (xchk_should_terminate(sc, &error)) 246 + return error; 247 + 248 + /* Copy the bitmap file that we generated. */ 249 + error = xrep_tempfile_copyin(sc, 0, rtb->rbmblocks, 250 + xrep_rtbitmap_prep_buf, rtb); 251 + if (error) 252 + return error; 253 + error = xrep_tempfile_set_isize(sc, 254 + XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)); 255 + if (error) 256 + return error; 257 + 258 + /* 259 + * Now exchange the data fork contents. We're done with the temporary 260 + * buffer, so we can reuse it for the tempfile exchmaps information. 261 + */ 262 + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0, 263 + rtb->rbmblocks, &rtb->tempexch); 264 + if (error) 265 + return error; 266 + 267 + error = xrep_tempexch_contents(sc, &rtb->tempexch); 268 + if (error) 269 + return error; 270 + 271 + /* Free the old rtbitmap blocks if they're not in use. */ 272 + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); 562 273 }

+661

fs/xfs/scrub/rtrefcount.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_log_format.h" 11 + #include "xfs_trans_resv.h" 12 + #include "xfs_mount.h" 13 + #include "xfs_trans.h" 14 + #include "xfs_btree.h" 15 + #include "xfs_rmap.h" 16 + #include "xfs_refcount.h" 17 + #include "xfs_inode.h" 18 + #include "xfs_rtbitmap.h" 19 + #include "xfs_rtgroup.h" 20 + #include "xfs_metafile.h" 21 + #include "xfs_rtrefcount_btree.h" 22 + #include "xfs_rtalloc.h" 23 + #include "scrub/scrub.h" 24 + #include "scrub/common.h" 25 + #include "scrub/btree.h" 26 + #include "scrub/repair.h" 27 + 28 + /* Set us up with the realtime refcount metadata locked. */ 29 + int 30 + xchk_setup_rtrefcountbt( 31 + struct xfs_scrub *sc) 32 + { 33 + int error; 34 + 35 + if (xchk_need_intent_drain(sc)) 36 + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); 37 + 38 + if (xchk_could_repair(sc)) { 39 + error = xrep_setup_rtrefcountbt(sc); 40 + if (error) 41 + return error; 42 + } 43 + 44 + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); 45 + if (error) 46 + return error; 47 + 48 + error = xchk_setup_rt(sc); 49 + if (error) 50 + return error; 51 + 52 + error = xchk_install_live_inode(sc, rtg_refcount(sc->sr.rtg)); 53 + if (error) 54 + return error; 55 + 56 + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); 57 + } 58 + 59 + /* Realtime Reference count btree scrubber. */ 60 + 61 + /* 62 + * Confirming Reference Counts via Reverse Mappings 63 + * 64 + * We want to count the reverse mappings overlapping a refcount record 65 + * (bno, len, refcount), allowing for the possibility that some of the 66 + * overlap may come from smaller adjoining reverse mappings, while some 67 + * comes from single extents which overlap the range entirely. 
The 68 + * outer loop is as follows: 69 + * 70 + * 1. For all reverse mappings overlapping the refcount extent, 71 + * a. If a given rmap completely overlaps, mark it as seen. 72 + * b. Otherwise, record the fragment (in agbno order) for later 73 + * processing. 74 + * 75 + * Once we've seen all the rmaps, we know that for all blocks in the 76 + * refcount record we want to find $refcount owners and we've already 77 + * visited $seen extents that overlap all the blocks. Therefore, we 78 + * need to find ($refcount - $seen) owners for every block in the 79 + * extent; call that quantity $target_nr. Proceed as follows: 80 + * 81 + * 2. Pull the first $target_nr fragments from the list; all of them 82 + * should start at or before the start of the extent. 83 + * Call this subset of fragments the working set. 84 + * 3. Until there are no more unprocessed fragments, 85 + * a. Find the shortest fragments in the set and remove them. 86 + * b. Note the block number of the end of these fragments. 87 + * c. Pull the same number of fragments from the list. All of these 88 + * fragments should start at the block number recorded in the 89 + * previous step. 90 + * d. Put those fragments in the set. 91 + * 4. Check that there are $target_nr fragments remaining in the list, 92 + * and that they all end at or beyond the end of the refcount extent. 93 + * 94 + * If the refcount is correct, all the check conditions in the algorithm 95 + * should always hold true. If not, the refcount is incorrect. 
96 + */ 97 + struct xchk_rtrefcnt_frag { 98 + struct list_head list; 99 + struct xfs_rmap_irec rm; 100 + }; 101 + 102 + struct xchk_rtrefcnt_check { 103 + struct xfs_scrub *sc; 104 + struct list_head fragments; 105 + 106 + /* refcount extent we're examining */ 107 + xfs_rgblock_t bno; 108 + xfs_extlen_t len; 109 + xfs_nlink_t refcount; 110 + 111 + /* number of owners seen */ 112 + xfs_nlink_t seen; 113 + }; 114 + 115 + /* 116 + * Decide if the given rmap is large enough that we can redeem it 117 + * towards refcount verification now, or if it's a fragment, in 118 + * which case we'll hang onto it in the hopes that we'll later 119 + * discover that we've collected exactly the correct number of 120 + * fragments as the rtrefcountbt says we should have. 121 + */ 122 + STATIC int 123 + xchk_rtrefcountbt_rmap_check( 124 + struct xfs_btree_cur *cur, 125 + const struct xfs_rmap_irec *rec, 126 + void *priv) 127 + { 128 + struct xchk_rtrefcnt_check *refchk = priv; 129 + struct xchk_rtrefcnt_frag *frag; 130 + xfs_rgblock_t rm_last; 131 + xfs_rgblock_t rc_last; 132 + int error = 0; 133 + 134 + if (xchk_should_terminate(refchk->sc, &error)) 135 + return error; 136 + 137 + rm_last = rec->rm_startblock + rec->rm_blockcount - 1; 138 + rc_last = refchk->bno + refchk->len - 1; 139 + 140 + /* Confirm that a single-owner refc extent is a CoW stage. */ 141 + if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) { 142 + xchk_btree_xref_set_corrupt(refchk->sc, cur, 0); 143 + return 0; 144 + } 145 + 146 + if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) { 147 + /* 148 + * The rmap overlaps the refcount record, so we can confirm 149 + * one refcount owner seen. 150 + */ 151 + refchk->seen++; 152 + } else { 153 + /* 154 + * This rmap covers only part of the refcount record, so 155 + * save the fragment for later processing. If the rmapbt 156 + * is healthy each rmap_irec we see will be in agbno order 157 + * so we don't need insertion sort here. 
158 + */ 159 + frag = kmalloc(sizeof(struct xchk_rtrefcnt_frag), 160 + XCHK_GFP_FLAGS); 161 + if (!frag) 162 + return -ENOMEM; 163 + memcpy(&frag->rm, rec, sizeof(frag->rm)); 164 + list_add_tail(&frag->list, &refchk->fragments); 165 + } 166 + 167 + return 0; 168 + } 169 + 170 + /* 171 + * Given a bunch of rmap fragments, iterate through them, keeping 172 + * a running tally of the refcount. If this ever deviates from 173 + * what we expect (which is the rtrefcountbt's refcount minus the 174 + * number of extents that totally covered the rtrefcountbt extent), 175 + * we have a rtrefcountbt error. 176 + */ 177 + STATIC void 178 + xchk_rtrefcountbt_process_rmap_fragments( 179 + struct xchk_rtrefcnt_check *refchk) 180 + { 181 + struct list_head worklist; 182 + struct xchk_rtrefcnt_frag *frag; 183 + struct xchk_rtrefcnt_frag *n; 184 + xfs_rgblock_t bno; 185 + xfs_rgblock_t rbno; 186 + xfs_rgblock_t next_rbno; 187 + xfs_nlink_t nr; 188 + xfs_nlink_t target_nr; 189 + 190 + target_nr = refchk->refcount - refchk->seen; 191 + if (target_nr == 0) 192 + return; 193 + 194 + /* 195 + * There are (refchk->refcount - refchk->seen) 196 + * references we haven't found yet. Pull that many off the 197 + * fragment list and figure out where the smallest rmap ends 198 + * (and therefore the next rmap should start). All the rmaps 199 + * we pull off should start at or before the beginning of the 200 + * refcount record's range. 201 + */ 202 + INIT_LIST_HEAD(&worklist); 203 + rbno = NULLRGBLOCK; 204 + 205 + /* Make sure the fragments actually /are/ in bno order. */ 206 + bno = 0; 207 + list_for_each_entry(frag, &refchk->fragments, list) { 208 + if (frag->rm.rm_startblock < bno) 209 + goto done; 210 + bno = frag->rm.rm_startblock; 211 + } 212 + 213 + /* 214 + * Find all the rmaps that start at or before the refc extent, 215 + * and put them on the worklist.
216 + */ 217 + nr = 0; 218 + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { 219 + if (frag->rm.rm_startblock > refchk->bno || nr > target_nr) 220 + break; 221 + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; 222 + if (bno < rbno) 223 + rbno = bno; 224 + list_move_tail(&frag->list, &worklist); 225 + nr++; 226 + } 227 + 228 + /* 229 + * We should have found exactly $target_nr rmap fragments starting 230 + * at or before the refcount extent. 231 + */ 232 + if (nr != target_nr) 233 + goto done; 234 + 235 + while (!list_empty(&refchk->fragments)) { 236 + /* Discard any fragments ending at rbno from the worklist. */ 237 + nr = 0; 238 + next_rbno = NULLRGBLOCK; 239 + list_for_each_entry_safe(frag, n, &worklist, list) { 240 + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; 241 + if (bno != rbno) { 242 + if (bno < next_rbno) 243 + next_rbno = bno; 244 + continue; 245 + } 246 + list_del(&frag->list); 247 + kfree(frag); 248 + nr++; 249 + } 250 + 251 + /* Try to add nr rmaps starting at rbno to the worklist. */ 252 + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { 253 + bno = frag->rm.rm_startblock + frag->rm.rm_blockcount; 254 + if (frag->rm.rm_startblock != rbno) 255 + goto done; 256 + list_move_tail(&frag->list, &worklist); 257 + if (next_rbno > bno) 258 + next_rbno = bno; 259 + nr--; 260 + if (nr == 0) 261 + break; 262 + } 263 + 264 + /* 265 + * If we get here and nr > 0, this means that we added fewer 266 + * items to the worklist than we discarded because the fragment 267 + * list ran out of items. Therefore, we cannot maintain the 268 + * required refcount. Something is wrong, so we're done. 269 + */ 270 + if (nr) 271 + goto done; 272 + 273 + rbno = next_rbno; 274 + } 275 + 276 + /* 277 + * Make sure the last extent we processed ends at or beyond 278 + * the end of the refcount extent. 279 + */ 280 + if (rbno < refchk->bno + refchk->len) 281 + goto done; 282 + 283 + /* Actually record us having seen the remaining refcount. 
*/ 284 + refchk->seen = refchk->refcount; 285 + done: 286 + /* Delete fragments and work list. */ 287 + list_for_each_entry_safe(frag, n, &worklist, list) { 288 + list_del(&frag->list); 289 + kfree(frag); 290 + } 291 + list_for_each_entry_safe(frag, n, &refchk->fragments, list) { 292 + list_del(&frag->list); 293 + kfree(frag); 294 + } 295 + } 296 + 297 + /* Use the rmap entries covering this extent to verify the refcount. */ 298 + STATIC void 299 + xchk_rtrefcountbt_xref_rmap( 300 + struct xfs_scrub *sc, 301 + const struct xfs_refcount_irec *irec) 302 + { 303 + struct xchk_rtrefcnt_check refchk = { 304 + .sc = sc, 305 + .bno = irec->rc_startblock, 306 + .len = irec->rc_blockcount, 307 + .refcount = irec->rc_refcount, 308 + .seen = 0, 309 + }; 310 + struct xfs_rmap_irec low; 311 + struct xfs_rmap_irec high; 312 + struct xchk_rtrefcnt_frag *frag; 313 + struct xchk_rtrefcnt_frag *n; 314 + int error; 315 + 316 + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) 317 + return; 318 + 319 + /* Cross-reference with the rmapbt to confirm the refcount. */ 320 + memset(&low, 0, sizeof(low)); 321 + low.rm_startblock = irec->rc_startblock; 322 + memset(&high, 0xFF, sizeof(high)); 323 + high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1; 324 + 325 + INIT_LIST_HEAD(&refchk.fragments); 326 + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, 327 + xchk_rtrefcountbt_rmap_check, &refchk); 328 + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) 329 + goto out_free; 330 + 331 + xchk_rtrefcountbt_process_rmap_fragments(&refchk); 332 + if (irec->rc_refcount != refchk.seen) 333 + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); 334 + 335 + out_free: 336 + list_for_each_entry_safe(frag, n, &refchk.fragments, list) { 337 + list_del(&frag->list); 338 + kfree(frag); 339 + } 340 + } 341 + 342 + /* Cross-reference with the other btrees. 
*/ 343 + STATIC void 344 + xchk_rtrefcountbt_xref( 345 + struct xfs_scrub *sc, 346 + const struct xfs_refcount_irec *irec) 347 + { 348 + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 349 + return; 350 + 351 + xchk_xref_is_used_rt_space(sc, 352 + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rc_startblock), 353 + irec->rc_blockcount); 354 + xchk_rtrefcountbt_xref_rmap(sc, irec); 355 + } 356 + 357 + struct xchk_rtrefcbt_records { 358 + /* Previous refcount record. */ 359 + struct xfs_refcount_irec prev_rec; 360 + 361 + /* The next rtgroup block where we aren't expecting shared extents. */ 362 + xfs_rgblock_t next_unshared_rgbno; 363 + 364 + /* Number of CoW blocks we expect. */ 365 + xfs_extlen_t cow_blocks; 366 + 367 + /* Was the last record a shared or CoW staging extent? */ 368 + enum xfs_refc_domain prev_domain; 369 + }; 370 + 371 + static inline bool 372 + xchk_rtrefcount_mergeable( 373 + struct xchk_rtrefcbt_records *rrc, 374 + const struct xfs_refcount_irec *r2) 375 + { 376 + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; 377 + 378 + /* Ignore if prev_rec is not yet initialized. */ 379 + if (r1->rc_blockcount > 0) 380 + return false; 381 + 382 + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) 383 + return false; 384 + if (r1->rc_refcount != r2->rc_refcount) 385 + return false; 386 + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > 387 + XFS_REFC_LEN_MAX) 388 + return false; 389 + 390 + return true; 391 + } 392 + 393 + /* Flag failures for records that could be merged. 
*/ 394 + STATIC void 395 + xchk_rtrefcountbt_check_mergeable( 396 + struct xchk_btree *bs, 397 + struct xchk_rtrefcbt_records *rrc, 398 + const struct xfs_refcount_irec *irec) 399 + { 400 + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 401 + return; 402 + 403 + if (xchk_rtrefcount_mergeable(rrc, irec)) 404 + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); 405 + 406 + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); 407 + } 408 + 409 + STATIC int 410 + xchk_rtrefcountbt_rmap_check_gap( 411 + struct xfs_btree_cur *cur, 412 + const struct xfs_rmap_irec *rec, 413 + void *priv) 414 + { 415 + xfs_rgblock_t *next_bno = priv; 416 + 417 + if (*next_bno != NULLRGBLOCK && rec->rm_startblock < *next_bno) 418 + return -ECANCELED; 419 + 420 + *next_bno = rec->rm_startblock + rec->rm_blockcount; 421 + return 0; 422 + } 423 + 424 + /* 425 + * Make sure that a gap in the reference count records does not correspond to 426 + * overlapping records (i.e. shared extents) in the reverse mappings. 427 + */ 428 + static inline void 429 + xchk_rtrefcountbt_xref_gaps( 430 + struct xfs_scrub *sc, 431 + struct xchk_rtrefcbt_records *rrc, 432 + xfs_rtblock_t bno) 433 + { 434 + struct xfs_rmap_irec low; 435 + struct xfs_rmap_irec high; 436 + xfs_rgblock_t next_bno = NULLRGBLOCK; 437 + int error; 438 + 439 + if (bno <= rrc->next_unshared_rgbno || !sc->sr.rmap_cur || 440 + xchk_skip_xref(sc->sm)) 441 + return; 442 + 443 + memset(&low, 0, sizeof(low)); 444 + low.rm_startblock = rrc->next_unshared_rgbno; 445 + memset(&high, 0xFF, sizeof(high)); 446 + high.rm_startblock = bno - 1; 447 + 448 + error = xfs_rmap_query_range(sc->sr.rmap_cur, &low, &high, 449 + xchk_rtrefcountbt_rmap_check_gap, &next_bno); 450 + if (error == -ECANCELED) 451 + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); 452 + else 453 + xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur); 454 + } 455 + 456 + /* Scrub a rtrefcountbt record. 
*/
STATIC int
xchk_rtrefcountbt_rec(
	struct xchk_btree		*bs,
	const union xfs_btree_rec	*rec)
{
	struct xfs_scrub		*sc = bs->sc;
	struct xfs_mount		*mp = bs->cur->bc_mp;
	struct xchk_rtrefcbt_records	*rrc = bs->private;
	struct xfs_refcount_irec	irec;
	u32				mod;

	/* Decode the record; a record that fails validation ends the check. */
	xfs_refcount_btrec_to_irec(rec, &irec);
	if (xfs_rtrefcount_check_irec(to_rtg(bs->cur->bc_group), &irec) !=
			NULL) {
		xchk_btree_set_corrupt(sc, bs->cur, 0);
		return 0;
	}

	/* We can only share full rt extents. */
	mod = xfs_rgbno_to_rtxoff(mp, irec.rc_startblock);
	if (mod)
		xchk_btree_set_corrupt(sc, bs->cur, 0);
	mod = xfs_extlen_to_rtxmod(mp, irec.rc_blockcount);
	if (mod)
		xchk_btree_set_corrupt(sc, bs->cur, 0);

	/* Tally CoW staging blocks for the later rmap comparison. */
	if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
		rrc->cow_blocks += irec.rc_blockcount;

	/* Shared records always come before CoW records. */
	if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED &&
	    rrc->prev_domain == XFS_REFC_DOMAIN_COW)
		xchk_btree_set_corrupt(sc, bs->cur, 0);
	rrc->prev_domain = irec.rc_domain;

	xchk_rtrefcountbt_check_mergeable(bs, rrc, &irec);
	xchk_rtrefcountbt_xref(sc, &irec);

	/*
	 * If this is a record for a shared extent, check that all blocks
	 * between the previous record and this one have at most one reverse
	 * mapping.
	 */
	if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) {
		xchk_rtrefcountbt_xref_gaps(sc, rrc, irec.rc_startblock);
		rrc->next_unshared_rgbno = irec.rc_startblock +
					   irec.rc_blockcount;
	}

	return 0;
}

/* Make sure we have as many refc blocks as the rmap says.
*/
STATIC void
xchk_refcount_xref_rmap(
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*btree_oinfo,
	xfs_extlen_t			cow_blocks)
{
	xfs_filblks_t			refcbt_blocks = 0;
	xfs_filblks_t			rmap_blocks;
	int				error;

	/* Both the rt rmap cursor and the AG rmap cursor are needed. */
	if (!sc->sr.rmap_cur || !sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
		return;

	/* Check that we saw as many refcbt blocks as the rmap knows about. */
	error = xfs_btree_count_blocks(sc->sr.refc_cur, &refcbt_blocks);
	if (!xchk_btree_process_error(sc, sc->sr.refc_cur, 0, &error))
		return;
	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, btree_oinfo,
			&rmap_blocks);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
		return;
	if (rmap_blocks != refcbt_blocks)
		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);

	/* Check that we saw as many cow blocks as the rmap knows about. */
	error = xchk_count_rmap_ownedby_ag(sc, sc->sr.rmap_cur,
			&XFS_RMAP_OINFO_COW, &rmap_blocks);
	if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur))
		return;
	if (rmap_blocks != cow_blocks)
		xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0);
}

/* Scrub the refcount btree for some rt group.
*/ 543 + int 544 + xchk_rtrefcountbt( 545 + struct xfs_scrub *sc) 546 + { 547 + struct xfs_owner_info btree_oinfo; 548 + struct xchk_rtrefcbt_records rrc = { 549 + .cow_blocks = 0, 550 + .next_unshared_rgbno = 0, 551 + .prev_domain = XFS_REFC_DOMAIN_SHARED, 552 + }; 553 + int error; 554 + 555 + error = xchk_metadata_inode_forks(sc); 556 + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 557 + return error; 558 + 559 + xfs_rmap_ino_bmbt_owner(&btree_oinfo, rtg_refcount(sc->sr.rtg)->i_ino, 560 + XFS_DATA_FORK); 561 + error = xchk_btree(sc, sc->sr.refc_cur, xchk_rtrefcountbt_rec, 562 + &btree_oinfo, &rrc); 563 + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 564 + return error; 565 + 566 + /* 567 + * Check that all blocks between the last refcount > 1 record and the 568 + * end of the rt volume have at most one reverse mapping. 569 + */ 570 + xchk_rtrefcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_rblocks); 571 + 572 + xchk_refcount_xref_rmap(sc, &btree_oinfo, rrc.cow_blocks); 573 + 574 + return 0; 575 + } 576 + 577 + /* xref check that a cow staging extent is marked in the rtrefcountbt. */ 578 + void 579 + xchk_xref_is_rt_cow_staging( 580 + struct xfs_scrub *sc, 581 + xfs_rgblock_t bno, 582 + xfs_extlen_t len) 583 + { 584 + struct xfs_refcount_irec rc; 585 + int has_refcount; 586 + int error; 587 + 588 + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) 589 + return; 590 + 591 + /* Find the CoW staging extent. 
*/ 592 + error = xfs_refcount_lookup_le(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW, 593 + bno, &has_refcount); 594 + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) 595 + return; 596 + if (!has_refcount) { 597 + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); 598 + return; 599 + } 600 + 601 + error = xfs_refcount_get_rec(sc->sr.refc_cur, &rc, &has_refcount); 602 + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) 603 + return; 604 + if (!has_refcount) { 605 + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); 606 + return; 607 + } 608 + 609 + /* CoW lookup returned a shared extent record? */ 610 + if (rc.rc_domain != XFS_REFC_DOMAIN_COW) 611 + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); 612 + 613 + /* Must be at least as long as what was passed in */ 614 + if (rc.rc_blockcount < len) 615 + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); 616 + } 617 + 618 + /* 619 + * xref check that the extent is not shared. Only file data blocks 620 + * can have multiple owners. 621 + */ 622 + void 623 + xchk_xref_is_not_rt_shared( 624 + struct xfs_scrub *sc, 625 + xfs_rgblock_t bno, 626 + xfs_extlen_t len) 627 + { 628 + enum xbtree_recpacking outcome; 629 + int error; 630 + 631 + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) 632 + return; 633 + 634 + error = xfs_refcount_has_records(sc->sr.refc_cur, 635 + XFS_REFC_DOMAIN_SHARED, bno, len, &outcome); 636 + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) 637 + return; 638 + if (outcome != XBTREE_RECPACKING_EMPTY) 639 + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); 640 + } 641 + 642 + /* xref check that the extent is not being used for CoW staging. 
*/
void
xchk_xref_is_not_rt_cow_staging(
	struct xfs_scrub	*sc,
	xfs_rgblock_t		bno,
	xfs_extlen_t		len)
{
	enum xbtree_recpacking	packing;
	int			error;

	/* Nothing to cross-reference against. */
	if (!sc->sr.refc_cur)
		return;
	if (xchk_skip_xref(sc->sm))
		return;

	/* Any CoW-domain record overlapping the range is an error. */
	error = xfs_refcount_has_records(sc->sr.refc_cur, XFS_REFC_DOMAIN_COW,
			bno, len, &packing);
	if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur))
		return;
	if (packing != XBTREE_RECPACKING_EMPTY)
		xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0);
}

+783

fs/xfs/scrub/rtrefcount_repair.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_trans_resv.h" 11 + #include "xfs_mount.h" 12 + #include "xfs_defer.h" 13 + #include "xfs_btree.h" 14 + #include "xfs_btree_staging.h" 15 + #include "xfs_bit.h" 16 + #include "xfs_log_format.h" 17 + #include "xfs_trans.h" 18 + #include "xfs_sb.h" 19 + #include "xfs_alloc.h" 20 + #include "xfs_ialloc.h" 21 + #include "xfs_rmap.h" 22 + #include "xfs_rmap_btree.h" 23 + #include "xfs_rtrmap_btree.h" 24 + #include "xfs_refcount.h" 25 + #include "xfs_rtrefcount_btree.h" 26 + #include "xfs_error.h" 27 + #include "xfs_health.h" 28 + #include "xfs_inode.h" 29 + #include "xfs_quota.h" 30 + #include "xfs_rtalloc.h" 31 + #include "xfs_ag.h" 32 + #include "xfs_rtgroup.h" 33 + #include "xfs_rtbitmap.h" 34 + #include "scrub/xfs_scrub.h" 35 + #include "scrub/scrub.h" 36 + #include "scrub/common.h" 37 + #include "scrub/btree.h" 38 + #include "scrub/trace.h" 39 + #include "scrub/repair.h" 40 + #include "scrub/bitmap.h" 41 + #include "scrub/fsb_bitmap.h" 42 + #include "scrub/xfile.h" 43 + #include "scrub/xfarray.h" 44 + #include "scrub/newbt.h" 45 + #include "scrub/reap.h" 46 + #include "scrub/rcbag.h" 47 + 48 + /* 49 + * Rebuilding the Reference Count Btree 50 + * ==================================== 51 + * 52 + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap 53 + * entries as rectangles representing extents of physical blocks, and 54 + * that the rectangles can be laid down to allow them to overlap each 55 + * other; then we know that we must emit a refcnt btree entry wherever 56 + * the amount of overlap changes, i.e. 
the emission stimulus is 57 + * level-triggered: 58 + * 59 + * - --- 60 + * -- ----- ---- --- ------ 61 + * -- ---- ----------- ---- --------- 62 + * -------------------------------- ----------- 63 + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ 64 + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 65 + * 66 + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). 67 + * 68 + * Note that in the actual refcnt btree we don't store the refcount < 2 69 + * cases because the bnobt tells us which blocks are free; single-use 70 + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt 71 + * supports storing multiple entries covering a given block we could 72 + * theoretically dispense with the refcntbt and simply count rmaps, but 73 + * that's inefficient in the (hot) write path, so we'll take the cost of 74 + * the extra tree to save time. Also there's no guarantee that rmap 75 + * will be enabled. 76 + * 77 + * Given an array of rmaps sorted by physical block number, a starting 78 + * physical block (sp), a bag to hold rmaps that cover sp, and the next 79 + * physical block where the level changes (np), we can reconstruct the 80 + * rt refcount btree as follows: 81 + * 82 + * While there are still unprocessed rmaps in the array, 83 + * - Set sp to the physical block (pblk) of the next unprocessed rmap. 84 + * - Add to the bag all rmaps in the array where startblock == sp. 85 + * - Set np to the physical block where the bag size will change. This 86 + * is the minimum of (the pblk of the next unprocessed rmap) and 87 + * (startblock + len of each rmap in the bag). 88 + * - Record the bag size as old_bag_size. 89 + * 90 + * - While the bag isn't empty, 91 + * - Remove from the bag all rmaps where startblock + len == np. 92 + * - Add to the bag all rmaps in the array where startblock == np. 93 + * - If the bag size isn't old_bag_size, store the refcount entry 94 + * (sp, np - sp, bag_size) in the refcnt btree. 
95 + * - If the bag is empty, break out of the inner loop. 96 + * - Set old_bag_size to the bag size 97 + * - Set sp = np. 98 + * - Set np to the physical block where the bag size will change. 99 + * This is the minimum of (the pblk of the next unprocessed rmap) 100 + * and (startblock + len of each rmap in the bag). 101 + * 102 + * Like all the other repairers, we make a list of all the refcount 103 + * records we need, then reinitialize the rt refcount btree root and 104 + * insert all the records. 105 + */ 106 + 107 + struct xrep_rtrefc { 108 + /* refcount extents */ 109 + struct xfarray *refcount_records; 110 + 111 + /* new refcountbt information */ 112 + struct xrep_newbt new_btree; 113 + 114 + /* old refcountbt blocks */ 115 + struct xfsb_bitmap old_rtrefcountbt_blocks; 116 + 117 + struct xfs_scrub *sc; 118 + 119 + /* get_records()'s position in the rt refcount record array. */ 120 + xfarray_idx_t array_cur; 121 + 122 + /* # of refcountbt blocks */ 123 + xfs_filblks_t btblocks; 124 + }; 125 + 126 + /* Set us up to repair refcount btrees. */ 127 + int 128 + xrep_setup_rtrefcountbt( 129 + struct xfs_scrub *sc) 130 + { 131 + char *descr; 132 + int error; 133 + 134 + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); 135 + error = xrep_setup_xfbtree(sc, descr); 136 + kfree(descr); 137 + return error; 138 + } 139 + 140 + /* Check for any obvious conflicts with this shared/CoW staging extent. */ 141 + STATIC int 142 + xrep_rtrefc_check_ext( 143 + struct xfs_scrub *sc, 144 + const struct xfs_refcount_irec *rec) 145 + { 146 + xfs_rgblock_t last; 147 + 148 + if (xfs_rtrefcount_check_irec(sc->sr.rtg, rec) != NULL) 149 + return -EFSCORRUPTED; 150 + 151 + if (xfs_rgbno_to_rtxoff(sc->mp, rec->rc_startblock) != 0) 152 + return -EFSCORRUPTED; 153 + 154 + last = rec->rc_startblock + rec->rc_blockcount - 1; 155 + if (xfs_rgbno_to_rtxoff(sc->mp, last) != sc->mp->m_sb.sb_rextsize - 1) 156 + return -EFSCORRUPTED; 157 + 158 + /* Make sure this isn't free space or misaligned. 
*/ 159 + return xrep_require_rtext_inuse(sc, rec->rc_startblock, 160 + rec->rc_blockcount); 161 + } 162 + 163 + /* Record a reference count extent. */ 164 + STATIC int 165 + xrep_rtrefc_stash( 166 + struct xrep_rtrefc *rr, 167 + enum xfs_refc_domain domain, 168 + xfs_rgblock_t bno, 169 + xfs_extlen_t len, 170 + uint64_t refcount) 171 + { 172 + struct xfs_refcount_irec irec = { 173 + .rc_startblock = bno, 174 + .rc_blockcount = len, 175 + .rc_refcount = refcount, 176 + .rc_domain = domain, 177 + }; 178 + int error = 0; 179 + 180 + if (xchk_should_terminate(rr->sc, &error)) 181 + return error; 182 + 183 + irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount); 184 + 185 + error = xrep_rtrefc_check_ext(rr->sc, &irec); 186 + if (error) 187 + return error; 188 + 189 + trace_xrep_refc_found(rtg_group(rr->sc->sr.rtg), &irec); 190 + 191 + return xfarray_append(rr->refcount_records, &irec); 192 + } 193 + 194 + /* Record a CoW staging extent. */ 195 + STATIC int 196 + xrep_rtrefc_stash_cow( 197 + struct xrep_rtrefc *rr, 198 + xfs_rgblock_t bno, 199 + xfs_extlen_t len) 200 + { 201 + return xrep_rtrefc_stash(rr, XFS_REFC_DOMAIN_COW, bno, len, 1); 202 + } 203 + 204 + /* Decide if an rmap could describe a shared extent. */ 205 + static inline bool 206 + xrep_rtrefc_rmap_shareable( 207 + const struct xfs_rmap_irec *rmap) 208 + { 209 + /* rt metadata are never sharable */ 210 + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) 211 + return false; 212 + 213 + /* Unwritten file blocks are not shareable. */ 214 + if (rmap->rm_flags & XFS_RMAP_UNWRITTEN) 215 + return false; 216 + 217 + return true; 218 + } 219 + 220 + /* Grab the next (abbreviated) rmap record from the rmapbt. 
*/ 221 + STATIC int 222 + xrep_rtrefc_walk_rmaps( 223 + struct xrep_rtrefc *rr, 224 + struct xfs_rmap_irec *rmap, 225 + bool *have_rec) 226 + { 227 + struct xfs_btree_cur *cur = rr->sc->sr.rmap_cur; 228 + struct xfs_mount *mp = cur->bc_mp; 229 + int have_gt; 230 + int error = 0; 231 + 232 + *have_rec = false; 233 + 234 + /* 235 + * Loop through the remaining rmaps. Remember CoW staging 236 + * extents and the refcountbt blocks from the old tree for later 237 + * disposal. We can only share written data fork extents, so 238 + * keep looping until we find an rmap for one. 239 + */ 240 + do { 241 + if (xchk_should_terminate(rr->sc, &error)) 242 + return error; 243 + 244 + error = xfs_btree_increment(cur, 0, &have_gt); 245 + if (error) 246 + return error; 247 + if (!have_gt) 248 + return 0; 249 + 250 + error = xfs_rmap_get_rec(cur, rmap, &have_gt); 251 + if (error) 252 + return error; 253 + if (XFS_IS_CORRUPT(mp, !have_gt)) { 254 + xfs_btree_mark_sick(cur); 255 + return -EFSCORRUPTED; 256 + } 257 + 258 + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { 259 + error = xrep_rtrefc_stash_cow(rr, rmap->rm_startblock, 260 + rmap->rm_blockcount); 261 + if (error) 262 + return error; 263 + } else if (xfs_is_sb_inum(mp, rmap->rm_owner) || 264 + (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | 265 + XFS_RMAP_BMBT_BLOCK))) { 266 + xfs_btree_mark_sick(cur); 267 + return -EFSCORRUPTED; 268 + } 269 + } while (!xrep_rtrefc_rmap_shareable(rmap)); 270 + 271 + *have_rec = true; 272 + return 0; 273 + } 274 + 275 + static inline uint32_t 276 + xrep_rtrefc_encode_startblock( 277 + const struct xfs_refcount_irec *irec) 278 + { 279 + uint32_t start; 280 + 281 + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; 282 + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) 283 + start |= XFS_REFC_COWFLAG; 284 + 285 + return start; 286 + } 287 + 288 + /* 289 + * Compare two refcount records. We want to sort in order of increasing block 290 + * number. 
291 + */ 292 + static int 293 + xrep_rtrefc_extent_cmp( 294 + const void *a, 295 + const void *b) 296 + { 297 + const struct xfs_refcount_irec *ap = a; 298 + const struct xfs_refcount_irec *bp = b; 299 + uint32_t sa, sb; 300 + 301 + sa = xrep_rtrefc_encode_startblock(ap); 302 + sb = xrep_rtrefc_encode_startblock(bp); 303 + 304 + if (sa > sb) 305 + return 1; 306 + if (sa < sb) 307 + return -1; 308 + return 0; 309 + } 310 + 311 + /* 312 + * Sort the refcount extents by startblock or else the btree records will be in 313 + * the wrong order. Make sure the records do not overlap in physical space. 314 + */ 315 + STATIC int 316 + xrep_rtrefc_sort_records( 317 + struct xrep_rtrefc *rr) 318 + { 319 + struct xfs_refcount_irec irec; 320 + xfarray_idx_t cur; 321 + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; 322 + xfs_rgblock_t next_rgbno = 0; 323 + int error; 324 + 325 + error = xfarray_sort(rr->refcount_records, xrep_rtrefc_extent_cmp, 326 + XFARRAY_SORT_KILLABLE); 327 + if (error) 328 + return error; 329 + 330 + foreach_xfarray_idx(rr->refcount_records, cur) { 331 + if (xchk_should_terminate(rr->sc, &error)) 332 + return error; 333 + 334 + error = xfarray_load(rr->refcount_records, cur, &irec); 335 + if (error) 336 + return error; 337 + 338 + if (dom == XFS_REFC_DOMAIN_SHARED && 339 + irec.rc_domain == XFS_REFC_DOMAIN_COW) { 340 + dom = irec.rc_domain; 341 + next_rgbno = 0; 342 + } 343 + 344 + if (dom != irec.rc_domain) 345 + return -EFSCORRUPTED; 346 + if (irec.rc_startblock < next_rgbno) 347 + return -EFSCORRUPTED; 348 + 349 + next_rgbno = irec.rc_startblock + irec.rc_blockcount; 350 + } 351 + 352 + return error; 353 + } 354 + 355 + /* Record extents that belong to the realtime refcount inode. 
*/ 356 + STATIC int 357 + xrep_rtrefc_walk_rmap( 358 + struct xfs_btree_cur *cur, 359 + const struct xfs_rmap_irec *rec, 360 + void *priv) 361 + { 362 + struct xrep_rtrefc *rr = priv; 363 + int error = 0; 364 + 365 + if (xchk_should_terminate(rr->sc, &error)) 366 + return error; 367 + 368 + /* Skip extents which are not owned by this inode and fork. */ 369 + if (rec->rm_owner != rr->sc->ip->i_ino) 370 + return 0; 371 + 372 + error = xrep_check_ino_btree_mapping(rr->sc, rec); 373 + if (error) 374 + return error; 375 + 376 + return xfsb_bitmap_set(&rr->old_rtrefcountbt_blocks, 377 + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), 378 + rec->rm_blockcount); 379 + } 380 + 381 + /* 382 + * Walk forward through the rmap btree to collect all rmaps starting at 383 + * @bno in @rmap_bag. These represent the file(s) that share ownership of 384 + * the current block. Upon return, the rmap cursor points to the last record 385 + * satisfying the startblock constraint. 386 + */ 387 + static int 388 + xrep_rtrefc_push_rmaps_at( 389 + struct xrep_rtrefc *rr, 390 + struct rcbag *rcstack, 391 + xfs_rgblock_t bno, 392 + struct xfs_rmap_irec *rmap, 393 + bool *have) 394 + { 395 + struct xfs_scrub *sc = rr->sc; 396 + int have_gt; 397 + int error; 398 + 399 + while (*have && rmap->rm_startblock == bno) { 400 + error = rcbag_add(rcstack, rr->sc->tp, rmap); 401 + if (error) 402 + return error; 403 + 404 + error = xrep_rtrefc_walk_rmaps(rr, rmap, have); 405 + if (error) 406 + return error; 407 + } 408 + 409 + error = xfs_btree_decrement(sc->sr.rmap_cur, 0, &have_gt); 410 + if (error) 411 + return error; 412 + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { 413 + xfs_btree_mark_sick(sc->sr.rmap_cur); 414 + return -EFSCORRUPTED; 415 + } 416 + 417 + return 0; 418 + } 419 + 420 + /* Scan one AG for reverse mappings for the realtime refcount btree. 
*/ 421 + STATIC int 422 + xrep_rtrefc_scan_ag( 423 + struct xrep_rtrefc *rr, 424 + struct xfs_perag *pag) 425 + { 426 + struct xfs_scrub *sc = rr->sc; 427 + int error; 428 + 429 + error = xrep_ag_init(sc, pag, &sc->sa); 430 + if (error) 431 + return error; 432 + 433 + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrefc_walk_rmap, rr); 434 + xchk_ag_free(sc, &sc->sa); 435 + return error; 436 + } 437 + 438 + /* Iterate all the rmap records to generate reference count data. */ 439 + STATIC int 440 + xrep_rtrefc_find_refcounts( 441 + struct xrep_rtrefc *rr) 442 + { 443 + struct xfs_scrub *sc = rr->sc; 444 + struct rcbag *rcstack; 445 + struct xfs_perag *pag = NULL; 446 + uint64_t old_stack_height; 447 + xfs_rgblock_t sbno; 448 + xfs_rgblock_t cbno; 449 + xfs_rgblock_t nbno; 450 + bool have; 451 + int error; 452 + 453 + /* Scan for old rtrefc btree blocks. */ 454 + while ((pag = xfs_perag_next(sc->mp, pag))) { 455 + error = xrep_rtrefc_scan_ag(rr, pag); 456 + if (error) { 457 + xfs_perag_rele(pag); 458 + return error; 459 + } 460 + } 461 + 462 + xrep_rtgroup_btcur_init(sc, &sc->sr); 463 + 464 + /* 465 + * Set up a bag to store all the rmap records that we're tracking to 466 + * generate a reference count record. If this exceeds 467 + * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount. 468 + */ 469 + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); 470 + if (error) 471 + goto out_cur; 472 + 473 + /* Start the rtrmapbt cursor to the left of all records. */ 474 + error = xfs_btree_goto_left_edge(sc->sr.rmap_cur); 475 + if (error) 476 + goto out_bag; 477 + 478 + /* Process reverse mappings into refcount data. 
*/ 479 + while (xfs_btree_has_more_records(sc->sr.rmap_cur)) { 480 + struct xfs_rmap_irec rmap; 481 + 482 + /* Push all rmaps with pblk == sbno onto the stack */ 483 + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); 484 + if (error) 485 + goto out_bag; 486 + if (!have) 487 + break; 488 + sbno = cbno = rmap.rm_startblock; 489 + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, sbno, &rmap, 490 + &have); 491 + if (error) 492 + goto out_bag; 493 + 494 + /* Set nbno to the bno of the next refcount change */ 495 + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); 496 + if (error) 497 + goto out_bag; 498 + 499 + ASSERT(nbno > sbno); 500 + old_stack_height = rcbag_count(rcstack); 501 + 502 + /* While stack isn't empty... */ 503 + while (rcbag_count(rcstack) > 0) { 504 + /* Pop all rmaps that end at nbno */ 505 + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); 506 + if (error) 507 + goto out_bag; 508 + 509 + /* Push array items that start at nbno */ 510 + error = xrep_rtrefc_walk_rmaps(rr, &rmap, &have); 511 + if (error) 512 + goto out_bag; 513 + if (have) { 514 + error = xrep_rtrefc_push_rmaps_at(rr, rcstack, 515 + nbno, &rmap, &have); 516 + if (error) 517 + goto out_bag; 518 + } 519 + 520 + /* Emit refcount if necessary */ 521 + ASSERT(nbno > cbno); 522 + if (rcbag_count(rcstack) != old_stack_height) { 523 + if (old_stack_height > 1) { 524 + error = xrep_rtrefc_stash(rr, 525 + XFS_REFC_DOMAIN_SHARED, 526 + cbno, nbno - cbno, 527 + old_stack_height); 528 + if (error) 529 + goto out_bag; 530 + } 531 + cbno = nbno; 532 + } 533 + 534 + /* Stack empty, go find the next rmap */ 535 + if (rcbag_count(rcstack) == 0) 536 + break; 537 + old_stack_height = rcbag_count(rcstack); 538 + sbno = nbno; 539 + 540 + /* Set nbno to the bno of the next refcount change */ 541 + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, 542 + &nbno); 543 + if (error) 544 + goto out_bag; 545 + 546 + ASSERT(nbno > sbno); 547 + } 548 + } 549 + 550 + ASSERT(rcbag_count(rcstack) == 0); 
551 + out_bag: 552 + rcbag_free(&rcstack); 553 + out_cur: 554 + xchk_rtgroup_btcur_free(&sc->sr); 555 + return error; 556 + } 557 + 558 + /* Retrieve refcountbt data for bulk load. */ 559 + STATIC int 560 + xrep_rtrefc_get_records( 561 + struct xfs_btree_cur *cur, 562 + unsigned int idx, 563 + struct xfs_btree_block *block, 564 + unsigned int nr_wanted, 565 + void *priv) 566 + { 567 + struct xrep_rtrefc *rr = priv; 568 + union xfs_btree_rec *block_rec; 569 + unsigned int loaded; 570 + int error; 571 + 572 + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { 573 + error = xfarray_load(rr->refcount_records, rr->array_cur++, 574 + &cur->bc_rec.rc); 575 + if (error) 576 + return error; 577 + 578 + block_rec = xfs_btree_rec_addr(cur, idx, block); 579 + cur->bc_ops->init_rec_from_cur(cur, block_rec); 580 + } 581 + 582 + return loaded; 583 + } 584 + 585 + /* Feed one of the new btree blocks to the bulk loader. */ 586 + STATIC int 587 + xrep_rtrefc_claim_block( 588 + struct xfs_btree_cur *cur, 589 + union xfs_btree_ptr *ptr, 590 + void *priv) 591 + { 592 + struct xrep_rtrefc *rr = priv; 593 + 594 + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); 595 + } 596 + 597 + /* Figure out how much space we need to create the incore btree root block. */ 598 + STATIC size_t 599 + xrep_rtrefc_iroot_size( 600 + struct xfs_btree_cur *cur, 601 + unsigned int level, 602 + unsigned int nr_this_level, 603 + void *priv) 604 + { 605 + return xfs_rtrefcount_broot_space_calc(cur->bc_mp, level, 606 + nr_this_level); 607 + } 608 + 609 + /* 610 + * Use the collected refcount information to stage a new rt refcount btree. If 611 + * this is successful we'll return with the new btree root information logged 612 + * to the repair transaction but not yet committed. 
613 + */ 614 + STATIC int 615 + xrep_rtrefc_build_new_tree( 616 + struct xrep_rtrefc *rr) 617 + { 618 + struct xfs_scrub *sc = rr->sc; 619 + struct xfs_rtgroup *rtg = sc->sr.rtg; 620 + struct xfs_btree_cur *refc_cur; 621 + int error; 622 + 623 + error = xrep_rtrefc_sort_records(rr); 624 + if (error) 625 + return error; 626 + 627 + /* 628 + * Prepare to construct the new btree by reserving disk space for the 629 + * new btree and setting up all the accounting information we'll need 630 + * to root the new btree while it's under construction and before we 631 + * attach it to the realtime refcount inode. 632 + */ 633 + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); 634 + if (error) 635 + return error; 636 + 637 + rr->new_btree.bload.get_records = xrep_rtrefc_get_records; 638 + rr->new_btree.bload.claim_block = xrep_rtrefc_claim_block; 639 + rr->new_btree.bload.iroot_size = xrep_rtrefc_iroot_size; 640 + 641 + refc_cur = xfs_rtrefcountbt_init_cursor(NULL, rtg); 642 + xfs_btree_stage_ifakeroot(refc_cur, &rr->new_btree.ifake); 643 + 644 + /* Compute how many blocks we'll need. */ 645 + error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload, 646 + xfarray_length(rr->refcount_records)); 647 + if (error) 648 + goto err_cur; 649 + 650 + /* Last chance to abort before we start committing fixes. */ 651 + if (xchk_should_terminate(sc, &error)) 652 + goto err_cur; 653 + 654 + /* 655 + * Guess how many blocks we're going to need to rebuild an entire 656 + * rtrefcountbt from the number of extents we found, and pump up our 657 + * transaction to have sufficient block reservation. We're allowed 658 + * to exceed quota to repair inconsistent metadata, though this is 659 + * unlikely. 660 + */ 661 + error = xfs_trans_reserve_more_inode(sc->tp, rtg_refcount(rtg), 662 + rr->new_btree.bload.nr_blocks, 0, true); 663 + if (error) 664 + goto err_cur; 665 + 666 + /* Reserve the space we'll need for the new btree. 
*/ 667 + error = xrep_newbt_alloc_blocks(&rr->new_btree, 668 + rr->new_btree.bload.nr_blocks); 669 + if (error) 670 + goto err_cur; 671 + 672 + /* Add all observed refcount records. */ 673 + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; 674 + rr->array_cur = XFARRAY_CURSOR_INIT; 675 + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); 676 + if (error) 677 + goto err_cur; 678 + 679 + /* 680 + * Install the new rtrefc btree in the inode. After this point the old 681 + * btree is no longer accessible, the new tree is live, and we can 682 + * delete the cursor. 683 + */ 684 + xfs_rtrefcountbt_commit_staged_btree(refc_cur, sc->tp); 685 + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); 686 + xfs_btree_del_cursor(refc_cur, 0); 687 + 688 + /* Dispose of any unused blocks and the accounting information. */ 689 + error = xrep_newbt_commit(&rr->new_btree); 690 + if (error) 691 + return error; 692 + 693 + return xrep_roll_trans(sc); 694 + err_cur: 695 + xfs_btree_del_cursor(refc_cur, error); 696 + xrep_newbt_cancel(&rr->new_btree); 697 + return error; 698 + } 699 + 700 + /* 701 + * Now that we've logged the roots of the new btrees, invalidate all of the 702 + * old blocks and free them. 703 + */ 704 + STATIC int 705 + xrep_rtrefc_remove_old_tree( 706 + struct xrep_rtrefc *rr) 707 + { 708 + int error; 709 + 710 + /* 711 + * Free all the extents that were allocated to the former rtrefcountbt 712 + * and aren't cross-linked with something else. 713 + */ 714 + error = xrep_reap_metadir_fsblocks(rr->sc, 715 + &rr->old_rtrefcountbt_blocks); 716 + if (error) 717 + return error; 718 + 719 + /* 720 + * Ensure the proper reservation for the rtrefcount inode so that we 721 + * don't fail to expand the btree. 722 + */ 723 + return xrep_reset_metafile_resv(rr->sc); 724 + } 725 + 726 + /* Rebuild the rt refcount btree. 
*/ 727 + int 728 + xrep_rtrefcountbt( 729 + struct xfs_scrub *sc) 730 + { 731 + struct xrep_rtrefc *rr; 732 + struct xfs_mount *mp = sc->mp; 733 + char *descr; 734 + int error; 735 + 736 + /* We require the rmapbt to rebuild anything. */ 737 + if (!xfs_has_rtrmapbt(mp)) 738 + return -EOPNOTSUPP; 739 + 740 + /* Make sure any problems with the fork are fixed. */ 741 + error = xrep_metadata_inode_forks(sc); 742 + if (error) 743 + return error; 744 + 745 + rr = kzalloc(sizeof(struct xrep_rtrefc), XCHK_GFP_FLAGS); 746 + if (!rr) 747 + return -ENOMEM; 748 + rr->sc = sc; 749 + 750 + /* Set up enough storage to handle one refcount record per rt extent. */ 751 + descr = xchk_xfile_ag_descr(sc, "reference count records"); 752 + error = xfarray_create(descr, mp->m_sb.sb_rextents, 753 + sizeof(struct xfs_refcount_irec), 754 + &rr->refcount_records); 755 + kfree(descr); 756 + if (error) 757 + goto out_rr; 758 + 759 + /* Collect all reference counts. */ 760 + xfsb_bitmap_init(&rr->old_rtrefcountbt_blocks); 761 + error = xrep_rtrefc_find_refcounts(rr); 762 + if (error) 763 + goto out_bitmap; 764 + 765 + xfs_trans_ijoin(sc->tp, sc->ip, 0); 766 + 767 + /* Rebuild the refcount information. */ 768 + error = xrep_rtrefc_build_new_tree(rr); 769 + if (error) 770 + goto out_bitmap; 771 + 772 + /* Kill the old tree. */ 773 + error = xrep_rtrefc_remove_old_tree(rr); 774 + if (error) 775 + goto out_bitmap; 776 + 777 + out_bitmap: 778 + xfsb_bitmap_destroy(&rr->old_rtrefcountbt_blocks); 779 + xfarray_destroy(rr->refcount_records); 780 + out_rr: 781 + kfree(rr); 782 + return error; 783 + }

+323

fs/xfs/scrub/rtrmap.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_trans_resv.h" 11 + #include "xfs_mount.h" 12 + #include "xfs_defer.h" 13 + #include "xfs_btree.h" 14 + #include "xfs_bit.h" 15 + #include "xfs_log_format.h" 16 + #include "xfs_trans.h" 17 + #include "xfs_sb.h" 18 + #include "xfs_rmap.h" 19 + #include "xfs_rmap_btree.h" 20 + #include "xfs_rtrmap_btree.h" 21 + #include "xfs_inode.h" 22 + #include "xfs_rtalloc.h" 23 + #include "xfs_rtgroup.h" 24 + #include "xfs_metafile.h" 25 + #include "xfs_refcount.h" 26 + #include "scrub/xfs_scrub.h" 27 + #include "scrub/scrub.h" 28 + #include "scrub/common.h" 29 + #include "scrub/btree.h" 30 + #include "scrub/trace.h" 31 + #include "scrub/repair.h" 32 + 33 + /* Set us up with the realtime metadata locked. */ 34 + int 35 + xchk_setup_rtrmapbt( 36 + struct xfs_scrub *sc) 37 + { 38 + int error; 39 + 40 + if (xchk_need_intent_drain(sc)) 41 + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); 42 + 43 + if (xchk_could_repair(sc)) { 44 + error = xrep_setup_rtrmapbt(sc); 45 + if (error) 46 + return error; 47 + } 48 + 49 + error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr); 50 + if (error) 51 + return error; 52 + 53 + error = xchk_setup_rt(sc); 54 + if (error) 55 + return error; 56 + 57 + error = xchk_install_live_inode(sc, rtg_rmap(sc->sr.rtg)); 58 + if (error) 59 + return error; 60 + 61 + return xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); 62 + } 63 + 64 + /* Realtime reverse mapping. */ 65 + 66 + struct xchk_rtrmap { 67 + /* 68 + * The furthest-reaching of the rmapbt records that we've already 69 + * processed. This enables us to detect overlapping records for space 70 + * allocations that cannot be shared. 
71 + */ 72 + struct xfs_rmap_irec overlap_rec; 73 + 74 + /* 75 + * The previous rmapbt record, so that we can check for two records 76 + * that could be one. 77 + */ 78 + struct xfs_rmap_irec prev_rec; 79 + }; 80 + 81 + static inline bool 82 + xchk_rtrmapbt_is_shareable( 83 + struct xfs_scrub *sc, 84 + const struct xfs_rmap_irec *irec) 85 + { 86 + if (!xfs_has_rtreflink(sc->mp)) 87 + return false; 88 + if (irec->rm_flags & XFS_RMAP_UNWRITTEN) 89 + return false; 90 + return true; 91 + } 92 + 93 + /* Flag failures for records that overlap but cannot. */ 94 + STATIC void 95 + xchk_rtrmapbt_check_overlapping( 96 + struct xchk_btree *bs, 97 + struct xchk_rtrmap *cr, 98 + const struct xfs_rmap_irec *irec) 99 + { 100 + xfs_rtblock_t pnext, inext; 101 + 102 + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 103 + return; 104 + 105 + /* No previous record? */ 106 + if (cr->overlap_rec.rm_blockcount == 0) 107 + goto set_prev; 108 + 109 + /* Do overlap_rec and irec overlap? */ 110 + pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount; 111 + if (pnext <= irec->rm_startblock) 112 + goto set_prev; 113 + 114 + /* Overlap is only allowed if both records are data fork mappings. */ 115 + if (!xchk_rtrmapbt_is_shareable(bs->sc, &cr->overlap_rec) || 116 + !xchk_rtrmapbt_is_shareable(bs->sc, irec)) 117 + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); 118 + 119 + /* Save whichever rmap record extends furthest. */ 120 + inext = irec->rm_startblock + irec->rm_blockcount; 121 + if (pnext > inext) 122 + return; 123 + 124 + set_prev: 125 + memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec)); 126 + } 127 + 128 + /* Decide if two reverse-mapping records can be merged. */ 129 + static inline bool 130 + xchk_rtrmap_mergeable( 131 + struct xchk_rtrmap *cr, 132 + const struct xfs_rmap_irec *r2) 133 + { 134 + const struct xfs_rmap_irec *r1 = &cr->prev_rec; 135 + 136 + /* Ignore if prev_rec is not yet initialized. 
*/ 137 + if (cr->prev_rec.rm_blockcount == 0) 138 + return false; 139 + 140 + if (r1->rm_owner != r2->rm_owner) 141 + return false; 142 + if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock) 143 + return false; 144 + if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount > 145 + XFS_RMAP_LEN_MAX) 146 + return false; 147 + if (r1->rm_flags != r2->rm_flags) 148 + return false; 149 + return r1->rm_offset + r1->rm_blockcount == r2->rm_offset; 150 + } 151 + 152 + /* Flag failures for records that could be merged. */ 153 + STATIC void 154 + xchk_rtrmapbt_check_mergeable( 155 + struct xchk_btree *bs, 156 + struct xchk_rtrmap *cr, 157 + const struct xfs_rmap_irec *irec) 158 + { 159 + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 160 + return; 161 + 162 + if (xchk_rtrmap_mergeable(cr, irec)) 163 + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); 164 + 165 + memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); 166 + } 167 + 168 + /* Cross-reference a rmap against the refcount btree. */ 169 + STATIC void 170 + xchk_rtrmapbt_xref_rtrefc( 171 + struct xfs_scrub *sc, 172 + struct xfs_rmap_irec *irec) 173 + { 174 + xfs_rgblock_t fbno; 175 + xfs_extlen_t flen; 176 + bool is_inode; 177 + bool is_bmbt; 178 + bool is_attr; 179 + bool is_unwritten; 180 + int error; 181 + 182 + if (!sc->sr.refc_cur || xchk_skip_xref(sc->sm)) 183 + return; 184 + 185 + is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); 186 + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; 187 + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; 188 + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; 189 + 190 + /* If this is shared, must be a data fork extent. 
*/ 191 + error = xfs_refcount_find_shared(sc->sr.refc_cur, irec->rm_startblock, 192 + irec->rm_blockcount, &fbno, &flen, false); 193 + if (!xchk_should_check_xref(sc, &error, &sc->sr.refc_cur)) 194 + return; 195 + if (flen != 0 && (!is_inode || is_attr || is_bmbt || is_unwritten)) 196 + xchk_btree_xref_set_corrupt(sc, sc->sr.refc_cur, 0); 197 + } 198 + 199 + /* Cross-reference with other metadata. */ 200 + STATIC void 201 + xchk_rtrmapbt_xref( 202 + struct xfs_scrub *sc, 203 + struct xfs_rmap_irec *irec) 204 + { 205 + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 206 + return; 207 + 208 + xchk_xref_is_used_rt_space(sc, 209 + xfs_rgbno_to_rtb(sc->sr.rtg, irec->rm_startblock), 210 + irec->rm_blockcount); 211 + if (irec->rm_owner == XFS_RMAP_OWN_COW) 212 + xchk_xref_is_cow_staging(sc, irec->rm_startblock, 213 + irec->rm_blockcount); 214 + else 215 + xchk_rtrmapbt_xref_rtrefc(sc, irec); 216 + } 217 + 218 + /* Scrub a realtime rmapbt record. */ 219 + STATIC int 220 + xchk_rtrmapbt_rec( 221 + struct xchk_btree *bs, 222 + const union xfs_btree_rec *rec) 223 + { 224 + struct xchk_rtrmap *cr = bs->private; 225 + struct xfs_rmap_irec irec; 226 + 227 + if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || 228 + xfs_rtrmap_check_irec(to_rtg(bs->cur->bc_group), &irec) != NULL) { 229 + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); 230 + return 0; 231 + } 232 + 233 + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 234 + return 0; 235 + 236 + xchk_rtrmapbt_check_mergeable(bs, cr, &irec); 237 + xchk_rtrmapbt_check_overlapping(bs, cr, &irec); 238 + xchk_rtrmapbt_xref(bs->sc, &irec); 239 + return 0; 240 + } 241 + 242 + /* Scrub the realtime rmap btree. 
*/ 243 + int 244 + xchk_rtrmapbt( 245 + struct xfs_scrub *sc) 246 + { 247 + struct xfs_inode *ip = rtg_rmap(sc->sr.rtg); 248 + struct xfs_owner_info oinfo; 249 + struct xchk_rtrmap cr = { }; 250 + int error; 251 + 252 + error = xchk_metadata_inode_forks(sc); 253 + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) 254 + return error; 255 + 256 + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, XFS_DATA_FORK); 257 + return xchk_btree(sc, sc->sr.rmap_cur, xchk_rtrmapbt_rec, &oinfo, &cr); 258 + } 259 + 260 + /* xref check that the extent has no realtime reverse mapping at all */ 261 + void 262 + xchk_xref_has_no_rt_owner( 263 + struct xfs_scrub *sc, 264 + xfs_rgblock_t bno, 265 + xfs_extlen_t len) 266 + { 267 + enum xbtree_recpacking outcome; 268 + int error; 269 + 270 + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) 271 + return; 272 + 273 + error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome); 274 + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) 275 + return; 276 + if (outcome != XBTREE_RECPACKING_EMPTY) 277 + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); 278 + } 279 + 280 + /* xref check that the extent is completely mapped */ 281 + void 282 + xchk_xref_has_rt_owner( 283 + struct xfs_scrub *sc, 284 + xfs_rgblock_t bno, 285 + xfs_extlen_t len) 286 + { 287 + enum xbtree_recpacking outcome; 288 + int error; 289 + 290 + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) 291 + return; 292 + 293 + error = xfs_rmap_has_records(sc->sr.rmap_cur, bno, len, &outcome); 294 + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) 295 + return; 296 + if (outcome != XBTREE_RECPACKING_FULL) 297 + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); 298 + } 299 + 300 + /* xref check that the extent is only owned by a given owner */ 301 + void 302 + xchk_xref_is_only_rt_owned_by( 303 + struct xfs_scrub *sc, 304 + xfs_agblock_t bno, 305 + xfs_extlen_t len, 306 + const struct xfs_owner_info *oinfo) 307 + { 308 + struct xfs_rmap_matches res; 309 + 
int error; 310 + 311 + if (!sc->sr.rmap_cur || xchk_skip_xref(sc->sm)) 312 + return; 313 + 314 + error = xfs_rmap_count_owners(sc->sr.rmap_cur, bno, len, oinfo, &res); 315 + if (!xchk_should_check_xref(sc, &error, &sc->sr.rmap_cur)) 316 + return; 317 + if (res.matches != 1) 318 + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); 319 + if (res.bad_non_owner_matches) 320 + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); 321 + if (res.non_owner_matches) 322 + xchk_btree_xref_set_corrupt(sc, sc->sr.rmap_cur, 0); 323 + }

+1006

fs/xfs/scrub/rtrmap_repair.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include "xfs.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_trans_resv.h" 11 + #include "xfs_mount.h" 12 + #include "xfs_defer.h" 13 + #include "xfs_btree.h" 14 + #include "xfs_btree_staging.h" 15 + #include "xfs_buf_mem.h" 16 + #include "xfs_btree_mem.h" 17 + #include "xfs_bit.h" 18 + #include "xfs_log_format.h" 19 + #include "xfs_trans.h" 20 + #include "xfs_sb.h" 21 + #include "xfs_alloc.h" 22 + #include "xfs_rmap.h" 23 + #include "xfs_rmap_btree.h" 24 + #include "xfs_rtrmap_btree.h" 25 + #include "xfs_inode.h" 26 + #include "xfs_icache.h" 27 + #include "xfs_bmap.h" 28 + #include "xfs_bmap_btree.h" 29 + #include "xfs_quota.h" 30 + #include "xfs_rtalloc.h" 31 + #include "xfs_ag.h" 32 + #include "xfs_rtgroup.h" 33 + #include "xfs_refcount.h" 34 + #include "scrub/xfs_scrub.h" 35 + #include "scrub/scrub.h" 36 + #include "scrub/common.h" 37 + #include "scrub/btree.h" 38 + #include "scrub/trace.h" 39 + #include "scrub/repair.h" 40 + #include "scrub/bitmap.h" 41 + #include "scrub/fsb_bitmap.h" 42 + #include "scrub/rgb_bitmap.h" 43 + #include "scrub/xfile.h" 44 + #include "scrub/xfarray.h" 45 + #include "scrub/iscan.h" 46 + #include "scrub/newbt.h" 47 + #include "scrub/reap.h" 48 + 49 + /* 50 + * Realtime Reverse Mapping Btree Repair 51 + * ===================================== 52 + * 53 + * This isn't quite as difficult as repairing the rmap btree on the data 54 + * device, since we only store the data fork extents of realtime files on the 55 + * realtime device. We still have to freeze the filesystem and stop the 56 + * background threads like we do for the rmap repair, but we only have to scan 57 + * realtime inodes. 
58 + * 59 + * Collecting entries for the new realtime rmap btree is easy -- all we have 60 + * to do is generate rtrmap entries from the data fork mappings of all realtime 61 + * files in the filesystem. We then scan the rmap btrees of the data device 62 + * looking for extents belonging to the old btree and note them in a bitmap. 63 + * 64 + * To rebuild the realtime rmap btree, we bulk-load the collected mappings into 65 + * a new btree cursor and atomically swap that into the realtime inode. Then 66 + * we can free the blocks from the old btree. 67 + * 68 + * We use the 'xrep_rtrmap' prefix for all the rmap functions. 69 + */ 70 + 71 + /* Context for collecting rmaps */ 72 + struct xrep_rtrmap { 73 + /* new rtrmapbt information */ 74 + struct xrep_newbt new_btree; 75 + 76 + /* lock for the xfbtree and xfile */ 77 + struct mutex lock; 78 + 79 + /* rmap records generated from primary metadata */ 80 + struct xfbtree rtrmap_btree; 81 + 82 + struct xfs_scrub *sc; 83 + 84 + /* bitmap of old rtrmapbt blocks */ 85 + struct xfsb_bitmap old_rtrmapbt_blocks; 86 + 87 + /* Hooks into rtrmap update code. */ 88 + struct xfs_rmap_hook rhook; 89 + 90 + /* inode scan cursor */ 91 + struct xchk_iscan iscan; 92 + 93 + /* in-memory btree cursor for the ->get_blocks walk */ 94 + struct xfs_btree_cur *mcur; 95 + 96 + /* Number of records we're staging in the new btree. */ 97 + uint64_t nr_records; 98 + }; 99 + 100 + /* Set us up to repair rt reverse mapping btrees. 
*/ 101 + int 102 + xrep_setup_rtrmapbt( 103 + struct xfs_scrub *sc) 104 + { 105 + struct xrep_rtrmap *rr; 106 + char *descr; 107 + int error; 108 + 109 + xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); 110 + 111 + descr = xchk_xfile_rtgroup_descr(sc, "reverse mapping records"); 112 + error = xrep_setup_xfbtree(sc, descr); 113 + kfree(descr); 114 + if (error) 115 + return error; 116 + 117 + rr = kzalloc(sizeof(struct xrep_rtrmap), XCHK_GFP_FLAGS); 118 + if (!rr) 119 + return -ENOMEM; 120 + 121 + rr->sc = sc; 122 + sc->buf = rr; 123 + return 0; 124 + } 125 + 126 + /* Make sure there's nothing funny about this mapping. */ 127 + STATIC int 128 + xrep_rtrmap_check_mapping( 129 + struct xfs_scrub *sc, 130 + const struct xfs_rmap_irec *rec) 131 + { 132 + if (xfs_rtrmap_check_irec(sc->sr.rtg, rec) != NULL) 133 + return -EFSCORRUPTED; 134 + 135 + /* Make sure this isn't free space. */ 136 + return xrep_require_rtext_inuse(sc, rec->rm_startblock, 137 + rec->rm_blockcount); 138 + } 139 + 140 + /* Store a reverse-mapping record. */ 141 + static inline int 142 + xrep_rtrmap_stash( 143 + struct xrep_rtrmap *rr, 144 + xfs_rgblock_t startblock, 145 + xfs_extlen_t blockcount, 146 + uint64_t owner, 147 + uint64_t offset, 148 + unsigned int flags) 149 + { 150 + struct xfs_rmap_irec rmap = { 151 + .rm_startblock = startblock, 152 + .rm_blockcount = blockcount, 153 + .rm_owner = owner, 154 + .rm_offset = offset, 155 + .rm_flags = flags, 156 + }; 157 + struct xfs_scrub *sc = rr->sc; 158 + struct xfs_btree_cur *mcur; 159 + int error = 0; 160 + 161 + if (xchk_should_terminate(sc, &error)) 162 + return error; 163 + 164 + if (xchk_iscan_aborted(&rr->iscan)) 165 + return -EFSCORRUPTED; 166 + 167 + trace_xrep_rtrmap_found(sc->mp, &rmap); 168 + 169 + /* Add entry to in-memory btree. 
*/ 170 + mutex_lock(&rr->lock); 171 + mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, &rr->rtrmap_btree); 172 + error = xfs_rmap_map_raw(mcur, &rmap); 173 + xfs_btree_del_cursor(mcur, error); 174 + if (error) 175 + goto out_cancel; 176 + 177 + error = xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp); 178 + if (error) 179 + goto out_abort; 180 + 181 + mutex_unlock(&rr->lock); 182 + return 0; 183 + 184 + out_cancel: 185 + xfbtree_trans_cancel(&rr->rtrmap_btree, sc->tp); 186 + out_abort: 187 + xchk_iscan_abort(&rr->iscan); 188 + mutex_unlock(&rr->lock); 189 + return error; 190 + } 191 + 192 + /* Finding all file and bmbt extents. */ 193 + 194 + /* Context for accumulating rmaps for an inode fork. */ 195 + struct xrep_rtrmap_ifork { 196 + /* 197 + * Accumulate rmap data here to turn multiple adjacent bmaps into a 198 + * single rmap. 199 + */ 200 + struct xfs_rmap_irec accum; 201 + 202 + struct xrep_rtrmap *rr; 203 + }; 204 + 205 + /* Stash an rmap that we accumulated while walking an inode fork. */ 206 + STATIC int 207 + xrep_rtrmap_stash_accumulated( 208 + struct xrep_rtrmap_ifork *rf) 209 + { 210 + if (rf->accum.rm_blockcount == 0) 211 + return 0; 212 + 213 + return xrep_rtrmap_stash(rf->rr, rf->accum.rm_startblock, 214 + rf->accum.rm_blockcount, rf->accum.rm_owner, 215 + rf->accum.rm_offset, rf->accum.rm_flags); 216 + } 217 + 218 + /* Accumulate a bmbt record. 
*/ 219 + STATIC int 220 + xrep_rtrmap_visit_bmbt( 221 + struct xfs_btree_cur *cur, 222 + struct xfs_bmbt_irec *rec, 223 + void *priv) 224 + { 225 + struct xrep_rtrmap_ifork *rf = priv; 226 + struct xfs_rmap_irec *accum = &rf->accum; 227 + struct xfs_mount *mp = rf->rr->sc->mp; 228 + xfs_rgblock_t rgbno; 229 + unsigned int rmap_flags = 0; 230 + int error; 231 + 232 + if (xfs_rtb_to_rgno(mp, rec->br_startblock) != 233 + rtg_rgno(rf->rr->sc->sr.rtg)) 234 + return 0; 235 + 236 + if (rec->br_state == XFS_EXT_UNWRITTEN) 237 + rmap_flags |= XFS_RMAP_UNWRITTEN; 238 + 239 + /* If this bmap is adjacent to the previous one, just add it. */ 240 + rgbno = xfs_rtb_to_rgbno(mp, rec->br_startblock); 241 + if (accum->rm_blockcount > 0 && 242 + rec->br_startoff == accum->rm_offset + accum->rm_blockcount && 243 + rgbno == accum->rm_startblock + accum->rm_blockcount && 244 + rmap_flags == accum->rm_flags) { 245 + accum->rm_blockcount += rec->br_blockcount; 246 + return 0; 247 + } 248 + 249 + /* Otherwise stash the old rmap and start accumulating a new one. */ 250 + error = xrep_rtrmap_stash_accumulated(rf); 251 + if (error) 252 + return error; 253 + 254 + accum->rm_startblock = rgbno; 255 + accum->rm_blockcount = rec->br_blockcount; 256 + accum->rm_offset = rec->br_startoff; 257 + accum->rm_flags = rmap_flags; 258 + return 0; 259 + } 260 + 261 + /* 262 + * Iterate the block mapping btree to collect rmap records for anything in this 263 + * fork that maps to the rt volume. Sets @mappings_done to true if we've 264 + * scanned the block mappings in this fork. 
265 + */ 266 + STATIC int 267 + xrep_rtrmap_scan_bmbt( 268 + struct xrep_rtrmap_ifork *rf, 269 + struct xfs_inode *ip, 270 + bool *mappings_done) 271 + { 272 + struct xrep_rtrmap *rr = rf->rr; 273 + struct xfs_btree_cur *cur; 274 + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 275 + int error = 0; 276 + 277 + *mappings_done = false; 278 + 279 + /* 280 + * If the incore extent cache is already loaded, we'll just use the 281 + * incore extent scanner to record mappings. Don't bother walking the 282 + * ondisk extent tree. 283 + */ 284 + if (!xfs_need_iread_extents(ifp)) 285 + return 0; 286 + 287 + /* Accumulate all the mappings in the bmap btree. */ 288 + cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, XFS_DATA_FORK); 289 + error = xfs_bmap_query_all(cur, xrep_rtrmap_visit_bmbt, rf); 290 + xfs_btree_del_cursor(cur, error); 291 + if (error) 292 + return error; 293 + 294 + /* Stash any remaining accumulated rmaps and exit. */ 295 + *mappings_done = true; 296 + return xrep_rtrmap_stash_accumulated(rf); 297 + } 298 + 299 + /* 300 + * Iterate the in-core extent cache to collect rmap records for anything in 301 + * this fork that matches the AG. 302 + */ 303 + STATIC int 304 + xrep_rtrmap_scan_iext( 305 + struct xrep_rtrmap_ifork *rf, 306 + struct xfs_ifork *ifp) 307 + { 308 + struct xfs_bmbt_irec rec; 309 + struct xfs_iext_cursor icur; 310 + int error; 311 + 312 + for_each_xfs_iext(ifp, &icur, &rec) { 313 + if (isnullstartblock(rec.br_startblock)) 314 + continue; 315 + error = xrep_rtrmap_visit_bmbt(NULL, &rec, rf); 316 + if (error) 317 + return error; 318 + } 319 + 320 + return xrep_rtrmap_stash_accumulated(rf); 321 + } 322 + 323 + /* Find all the extents on the realtime device mapped by an inode fork. 
*/ 324 + STATIC int 325 + xrep_rtrmap_scan_dfork( 326 + struct xrep_rtrmap *rr, 327 + struct xfs_inode *ip) 328 + { 329 + struct xrep_rtrmap_ifork rf = { 330 + .accum = { .rm_owner = ip->i_ino, }, 331 + .rr = rr, 332 + }; 333 + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 334 + int error = 0; 335 + 336 + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { 337 + bool mappings_done; 338 + 339 + /* 340 + * Scan the bmbt for mappings. If the incore extent tree is 341 + * loaded, we want to scan the cached mappings since that's 342 + * faster when the extent counts are very high. 343 + */ 344 + error = xrep_rtrmap_scan_bmbt(&rf, ip, &mappings_done); 345 + if (error || mappings_done) 346 + return error; 347 + } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { 348 + /* realtime data forks should only be extents or btree */ 349 + return -EFSCORRUPTED; 350 + } 351 + 352 + /* Scan incore extent cache. */ 353 + return xrep_rtrmap_scan_iext(&rf, ifp); 354 + } 355 + 356 + /* Record reverse mappings for a file. */ 357 + STATIC int 358 + xrep_rtrmap_scan_inode( 359 + struct xrep_rtrmap *rr, 360 + struct xfs_inode *ip) 361 + { 362 + unsigned int lock_mode; 363 + int error = 0; 364 + 365 + /* Skip the rt rmap btree inode. */ 366 + if (rr->sc->ip == ip) 367 + return 0; 368 + 369 + lock_mode = xfs_ilock_data_map_shared(ip); 370 + 371 + /* Check the data fork if it's on the realtime device. */ 372 + if (XFS_IS_REALTIME_INODE(ip)) { 373 + error = xrep_rtrmap_scan_dfork(rr, ip); 374 + if (error) 375 + goto out_unlock; 376 + } 377 + 378 + xchk_iscan_mark_visited(&rr->iscan, ip); 379 + out_unlock: 380 + xfs_iunlock(ip, lock_mode); 381 + return error; 382 + } 383 + 384 + /* Record extents that belong to the realtime rmap inode. 
*/ 385 + STATIC int 386 + xrep_rtrmap_walk_rmap( 387 + struct xfs_btree_cur *cur, 388 + const struct xfs_rmap_irec *rec, 389 + void *priv) 390 + { 391 + struct xrep_rtrmap *rr = priv; 392 + int error = 0; 393 + 394 + if (xchk_should_terminate(rr->sc, &error)) 395 + return error; 396 + 397 + /* Skip extents which are not owned by this inode and fork. */ 398 + if (rec->rm_owner != rr->sc->ip->i_ino) 399 + return 0; 400 + 401 + error = xrep_check_ino_btree_mapping(rr->sc, rec); 402 + if (error) 403 + return error; 404 + 405 + return xfsb_bitmap_set(&rr->old_rtrmapbt_blocks, 406 + xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock), 407 + rec->rm_blockcount); 408 + } 409 + 410 + /* Scan one AG for reverse mappings for the realtime rmap btree. */ 411 + STATIC int 412 + xrep_rtrmap_scan_ag( 413 + struct xrep_rtrmap *rr, 414 + struct xfs_perag *pag) 415 + { 416 + struct xfs_scrub *sc = rr->sc; 417 + int error; 418 + 419 + error = xrep_ag_init(sc, pag, &sc->sa); 420 + if (error) 421 + return error; 422 + 423 + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrmap_walk_rmap, rr); 424 + xchk_ag_free(sc, &sc->sa); 425 + return error; 426 + } 427 + 428 + struct xrep_rtrmap_stash_run { 429 + struct xrep_rtrmap *rr; 430 + uint64_t owner; 431 + }; 432 + 433 + static int 434 + xrep_rtrmap_stash_run( 435 + uint32_t start, 436 + uint32_t len, 437 + void *priv) 438 + { 439 + struct xrep_rtrmap_stash_run *rsr = priv; 440 + struct xrep_rtrmap *rr = rsr->rr; 441 + xfs_rgblock_t rgbno = start; 442 + 443 + return xrep_rtrmap_stash(rr, rgbno, len, rsr->owner, 0, 0); 444 + } 445 + 446 + /* 447 + * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure 448 + * that the ranges are in units of FS blocks. 
449 + */ 450 + STATIC int 451 + xrep_rtrmap_stash_bitmap( 452 + struct xrep_rtrmap *rr, 453 + struct xrgb_bitmap *bitmap, 454 + const struct xfs_owner_info *oinfo) 455 + { 456 + struct xrep_rtrmap_stash_run rsr = { 457 + .rr = rr, 458 + .owner = oinfo->oi_owner, 459 + }; 460 + 461 + return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr); 462 + } 463 + 464 + /* Record a CoW staging extent. */ 465 + STATIC int 466 + xrep_rtrmap_walk_cowblocks( 467 + struct xfs_btree_cur *cur, 468 + const struct xfs_refcount_irec *irec, 469 + void *priv) 470 + { 471 + struct xrgb_bitmap *bitmap = priv; 472 + 473 + if (!xfs_refcount_check_domain(irec) || 474 + irec->rc_domain != XFS_REFC_DOMAIN_COW) 475 + return -EFSCORRUPTED; 476 + 477 + return xrgb_bitmap_set(bitmap, irec->rc_startblock, 478 + irec->rc_blockcount); 479 + } 480 + 481 + /* 482 + * Collect rmaps for the blocks containing the refcount btree, and all CoW 483 + * staging extents. 484 + */ 485 + STATIC int 486 + xrep_rtrmap_find_refcount_rmaps( 487 + struct xrep_rtrmap *rr) 488 + { 489 + struct xrgb_bitmap cow_blocks; /* COWBIT */ 490 + struct xfs_refcount_irec low = { 491 + .rc_startblock = 0, 492 + .rc_domain = XFS_REFC_DOMAIN_COW, 493 + }; 494 + struct xfs_refcount_irec high = { 495 + .rc_startblock = -1U, 496 + .rc_domain = XFS_REFC_DOMAIN_COW, 497 + }; 498 + struct xfs_scrub *sc = rr->sc; 499 + int error; 500 + 501 + if (!xfs_has_rtreflink(sc->mp)) 502 + return 0; 503 + 504 + xrgb_bitmap_init(&cow_blocks); 505 + 506 + /* Collect rmaps for CoW staging extents. */ 507 + error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high, 508 + xrep_rtrmap_walk_cowblocks, &cow_blocks); 509 + if (error) 510 + goto out_bitmap; 511 + 512 + /* Generate rmaps for everything. 
*/ 513 + error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); 514 + if (error) 515 + goto out_bitmap; 516 + 517 + out_bitmap: 518 + xrgb_bitmap_destroy(&cow_blocks); 519 + return error; 520 + } 521 + 522 + /* Count and check all collected records. */ 523 + STATIC int 524 + xrep_rtrmap_check_record( 525 + struct xfs_btree_cur *cur, 526 + const struct xfs_rmap_irec *rec, 527 + void *priv) 528 + { 529 + struct xrep_rtrmap *rr = priv; 530 + int error; 531 + 532 + error = xrep_rtrmap_check_mapping(rr->sc, rec); 533 + if (error) 534 + return error; 535 + 536 + rr->nr_records++; 537 + return 0; 538 + } 539 + 540 + /* Generate all the reverse-mappings for the realtime device. */ 541 + STATIC int 542 + xrep_rtrmap_find_rmaps( 543 + struct xrep_rtrmap *rr) 544 + { 545 + struct xfs_scrub *sc = rr->sc; 546 + struct xfs_perag *pag = NULL; 547 + struct xfs_inode *ip; 548 + struct xfs_btree_cur *mcur; 549 + int error; 550 + 551 + /* Generate rmaps for the realtime superblock */ 552 + if (xfs_has_rtsb(sc->mp) && rtg_rgno(rr->sc->sr.rtg) == 0) { 553 + error = xrep_rtrmap_stash(rr, 0, sc->mp->m_sb.sb_rextsize, 554 + XFS_RMAP_OWN_FS, 0, 0); 555 + if (error) 556 + return error; 557 + } 558 + 559 + /* Find CoW staging extents. */ 560 + xrep_rtgroup_btcur_init(sc, &sc->sr); 561 + error = xrep_rtrmap_find_refcount_rmaps(rr); 562 + xchk_rtgroup_btcur_free(&sc->sr); 563 + if (error) 564 + return error; 565 + 566 + /* 567 + * Set up for a potentially lengthy filesystem scan by reducing our 568 + * transaction resource usage for the duration. Specifically: 569 + * 570 + * Unlock the realtime metadata inodes and cancel the transaction to 571 + * release the log grant space while we scan the filesystem. 572 + * 573 + * Create a new empty transaction to eliminate the possibility of the 574 + * inode scan deadlocking on cyclical metadata. 575 + * 576 + * We pass the empty transaction to the file scanning function to avoid 577 + * repeatedly cycling empty transactions. 
This can be done even though 578 + * we take the IOLOCK to quiesce the file because empty transactions 579 + * do not take sb_internal. 580 + */ 581 + xchk_trans_cancel(sc); 582 + xchk_rtgroup_unlock(&sc->sr); 583 + error = xchk_trans_alloc_empty(sc); 584 + if (error) 585 + return error; 586 + 587 + while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { 588 + error = xrep_rtrmap_scan_inode(rr, ip); 589 + xchk_irele(sc, ip); 590 + if (error) 591 + break; 592 + 593 + if (xchk_should_terminate(sc, &error)) 594 + break; 595 + } 596 + xchk_iscan_iter_finish(&rr->iscan); 597 + if (error) 598 + return error; 599 + 600 + /* 601 + * Switch out for a real transaction and lock the RT metadata in 602 + * preparation for building a new tree. 603 + */ 604 + xchk_trans_cancel(sc); 605 + error = xchk_setup_rt(sc); 606 + if (error) 607 + return error; 608 + error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL); 609 + if (error) 610 + return error; 611 + 612 + /* 613 + * If a hook failed to update the in-memory btree, we lack the data to 614 + * continue the repair. 615 + */ 616 + if (xchk_iscan_aborted(&rr->iscan)) 617 + return -EFSCORRUPTED; 618 + 619 + /* Scan for old rtrmap blocks. */ 620 + while ((pag = xfs_perag_next(sc->mp, pag))) { 621 + error = xrep_rtrmap_scan_ag(rr, pag); 622 + if (error) { 623 + xfs_perag_rele(pag); 624 + return error; 625 + } 626 + } 627 + 628 + /* 629 + * Now that we have everything locked again, we need to count the 630 + * number of rmap records stashed in the btree. This should reflect 631 + * all actively-owned rt files in the filesystem. At the same time, 632 + * check all our records before we start building a new btree, which 633 + * requires the rtbitmap lock. 
634 + */ 635 + mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, NULL, &rr->rtrmap_btree); 636 + rr->nr_records = 0; 637 + error = xfs_rmap_query_all(mcur, xrep_rtrmap_check_record, rr); 638 + xfs_btree_del_cursor(mcur, error); 639 + 640 + return error; 641 + } 642 + 643 + /* Building the new rtrmap btree. */ 644 + 645 + /* Retrieve rtrmapbt data for bulk load. */ 646 + STATIC int 647 + xrep_rtrmap_get_records( 648 + struct xfs_btree_cur *cur, 649 + unsigned int idx, 650 + struct xfs_btree_block *block, 651 + unsigned int nr_wanted, 652 + void *priv) 653 + { 654 + struct xrep_rtrmap *rr = priv; 655 + union xfs_btree_rec *block_rec; 656 + unsigned int loaded; 657 + int error; 658 + 659 + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { 660 + int stat = 0; 661 + 662 + error = xfs_btree_increment(rr->mcur, 0, &stat); 663 + if (error) 664 + return error; 665 + if (!stat) 666 + return -EFSCORRUPTED; 667 + 668 + error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat); 669 + if (error) 670 + return error; 671 + if (!stat) 672 + return -EFSCORRUPTED; 673 + 674 + block_rec = xfs_btree_rec_addr(cur, idx, block); 675 + cur->bc_ops->init_rec_from_cur(cur, block_rec); 676 + } 677 + 678 + return loaded; 679 + } 680 + 681 + /* Feed one of the new btree blocks to the bulk loader. */ 682 + STATIC int 683 + xrep_rtrmap_claim_block( 684 + struct xfs_btree_cur *cur, 685 + union xfs_btree_ptr *ptr, 686 + void *priv) 687 + { 688 + struct xrep_rtrmap *rr = priv; 689 + 690 + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); 691 + } 692 + 693 + /* Figure out how much space we need to create the incore btree root block. */ 694 + STATIC size_t 695 + xrep_rtrmap_iroot_size( 696 + struct xfs_btree_cur *cur, 697 + unsigned int level, 698 + unsigned int nr_this_level, 699 + void *priv) 700 + { 701 + return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level); 702 + } 703 + 704 + /* 705 + * Use the collected rmap information to stage a new rmap btree. 
If this is 706 + * successful we'll return with the new btree root information logged to the 707 + * repair transaction but not yet committed. This implements section (III) 708 + * above. 709 + */ 710 + STATIC int 711 + xrep_rtrmap_build_new_tree( 712 + struct xrep_rtrmap *rr) 713 + { 714 + struct xfs_scrub *sc = rr->sc; 715 + struct xfs_rtgroup *rtg = sc->sr.rtg; 716 + struct xfs_btree_cur *rmap_cur; 717 + int error; 718 + 719 + /* 720 + * Prepare to construct the new btree by reserving disk space for the 721 + * new btree and setting up all the accounting information we'll need 722 + * to root the new btree while it's under construction and before we 723 + * attach it to the realtime rmapbt inode. 724 + */ 725 + error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc); 726 + if (error) 727 + return error; 728 + 729 + rr->new_btree.bload.get_records = xrep_rtrmap_get_records; 730 + rr->new_btree.bload.claim_block = xrep_rtrmap_claim_block; 731 + rr->new_btree.bload.iroot_size = xrep_rtrmap_iroot_size; 732 + 733 + rmap_cur = xfs_rtrmapbt_init_cursor(NULL, rtg); 734 + xfs_btree_stage_ifakeroot(rmap_cur, &rr->new_btree.ifake); 735 + 736 + /* Compute how many blocks we'll need for the rmaps collected. */ 737 + error = xfs_btree_bload_compute_geometry(rmap_cur, 738 + &rr->new_btree.bload, rr->nr_records); 739 + if (error) 740 + goto err_cur; 741 + 742 + /* Last chance to abort before we start committing fixes. */ 743 + if (xchk_should_terminate(sc, &error)) 744 + goto err_cur; 745 + 746 + /* 747 + * Guess how many blocks we're going to need to rebuild an entire 748 + * rtrmapbt from the number of extents we found, and pump up our 749 + * transaction to have sufficient block reservation. We're allowed 750 + * to exceed quota to repair inconsistent metadata, though this is 751 + * unlikely. 
752 + */ 753 + error = xfs_trans_reserve_more_inode(sc->tp, rtg_rmap(rtg), 754 + rr->new_btree.bload.nr_blocks, 0, true); 755 + if (error) 756 + goto err_cur; 757 + 758 + /* Reserve the space we'll need for the new btree. */ 759 + error = xrep_newbt_alloc_blocks(&rr->new_btree, 760 + rr->new_btree.bload.nr_blocks); 761 + if (error) 762 + goto err_cur; 763 + 764 + /* 765 + * Create a cursor to the in-memory btree so that we can bulk load the 766 + * new btree. 767 + */ 768 + rr->mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, NULL, &rr->rtrmap_btree); 769 + error = xfs_btree_goto_left_edge(rr->mcur); 770 + if (error) 771 + goto err_mcur; 772 + 773 + /* Add all observed rmap records. */ 774 + rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE; 775 + error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); 776 + if (error) 777 + goto err_mcur; 778 + 779 + /* 780 + * Install the new rtrmap btree in the inode. After this point the old 781 + * btree is no longer accessible, the new tree is live, and we can 782 + * delete the cursor. 783 + */ 784 + xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp); 785 + xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks); 786 + xfs_btree_del_cursor(rmap_cur, 0); 787 + xfs_btree_del_cursor(rr->mcur, 0); 788 + rr->mcur = NULL; 789 + 790 + /* 791 + * Now that we've written the new btree to disk, we don't need to keep 792 + * updating the in-memory btree. Abort the scan to stop live updates. 793 + */ 794 + xchk_iscan_abort(&rr->iscan); 795 + 796 + /* Dispose of any unused blocks and the accounting information. */ 797 + error = xrep_newbt_commit(&rr->new_btree); 798 + if (error) 799 + return error; 800 + 801 + return xrep_roll_trans(sc); 802 + 803 + err_mcur: 804 + xfs_btree_del_cursor(rr->mcur, error); 805 + err_cur: 806 + xfs_btree_del_cursor(rmap_cur, error); 807 + xrep_newbt_cancel(&rr->new_btree); 808 + return error; 809 + } 810 + 811 + /* Reaping the old btree. */ 812 + 813 + /* Reap the old rtrmapbt blocks. 
*/ 814 + STATIC int 815 + xrep_rtrmap_remove_old_tree( 816 + struct xrep_rtrmap *rr) 817 + { 818 + int error; 819 + 820 + /* 821 + * Free all the extents that were allocated to the former rtrmapbt and 822 + * aren't cross-linked with something else. 823 + */ 824 + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); 825 + if (error) 826 + return error; 827 + 828 + /* 829 + * Ensure the proper reservation for the rtrmap inode so that we don't 830 + * fail to expand the new btree. 831 + */ 832 + return xrep_reset_metafile_resv(rr->sc); 833 + } 834 + 835 + static inline bool 836 + xrep_rtrmapbt_want_live_update( 837 + struct xchk_iscan *iscan, 838 + const struct xfs_owner_info *oi) 839 + { 840 + if (xchk_iscan_aborted(iscan)) 841 + return false; 842 + 843 + /* 844 + * We scanned the CoW staging extents before we started the iscan, so 845 + * we need all the updates. 846 + */ 847 + if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) 848 + return true; 849 + 850 + /* Ignore updates to files that the scanner hasn't visited yet. */ 851 + return xchk_iscan_want_live_update(iscan, oi->oi_owner); 852 + } 853 + 854 + /* 855 + * Apply a rtrmapbt update from the regular filesystem into our shadow btree. 856 + * We're running from the thread that owns the rtrmap ILOCK and is generating 857 + * the update, so we must be careful about which parts of the struct 858 + * xrep_rtrmap that we change. 
859 + */ 860 + static int 861 + xrep_rtrmapbt_live_update( 862 + struct notifier_block *nb, 863 + unsigned long action, 864 + void *data) 865 + { 866 + struct xfs_rmap_update_params *p = data; 867 + struct xrep_rtrmap *rr; 868 + struct xfs_mount *mp; 869 + struct xfs_btree_cur *mcur; 870 + struct xfs_trans *tp; 871 + void *txcookie; 872 + int error; 873 + 874 + rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb); 875 + mp = rr->sc->mp; 876 + 877 + if (!xrep_rtrmapbt_want_live_update(&rr->iscan, &p->oinfo)) 878 + goto out_unlock; 879 + 880 + trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p); 881 + 882 + error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); 883 + if (error) 884 + goto out_abort; 885 + 886 + mutex_lock(&rr->lock); 887 + mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree); 888 + error = __xfs_rmap_finish_intent(mcur, action, p->startblock, 889 + p->blockcount, &p->oinfo, p->unwritten); 890 + xfs_btree_del_cursor(mcur, error); 891 + if (error) 892 + goto out_cancel; 893 + 894 + error = xfbtree_trans_commit(&rr->rtrmap_btree, tp); 895 + if (error) 896 + goto out_cancel; 897 + 898 + xrep_trans_cancel_hook_dummy(&txcookie, tp); 899 + mutex_unlock(&rr->lock); 900 + return NOTIFY_DONE; 901 + 902 + out_cancel: 903 + xfbtree_trans_cancel(&rr->rtrmap_btree, tp); 904 + xrep_trans_cancel_hook_dummy(&txcookie, tp); 905 + out_abort: 906 + xchk_iscan_abort(&rr->iscan); 907 + mutex_unlock(&rr->lock); 908 + out_unlock: 909 + return NOTIFY_DONE; 910 + } 911 + 912 + /* Set up the filesystem scan components. 
*/ 913 + STATIC int 914 + xrep_rtrmap_setup_scan( 915 + struct xrep_rtrmap *rr) 916 + { 917 + struct xfs_scrub *sc = rr->sc; 918 + int error; 919 + 920 + mutex_init(&rr->lock); 921 + xfsb_bitmap_init(&rr->old_rtrmapbt_blocks); 922 + 923 + /* Set up some storage */ 924 + error = xfs_rtrmapbt_mem_init(sc->mp, &rr->rtrmap_btree, sc->xmbtp, 925 + rtg_rgno(sc->sr.rtg)); 926 + if (error) 927 + goto out_bitmap; 928 + 929 + /* Retry iget every tenth of a second for up to 30 seconds. */ 930 + xchk_iscan_start(sc, 30000, 100, &rr->iscan); 931 + 932 + /* 933 + * Hook into live rtrmap operations so that we can update our in-memory 934 + * btree to reflect live changes on the filesystem. Since we drop the 935 + * rtrmap ILOCK to scan all the inodes, we need this piece to avoid 936 + * installing a stale btree. 937 + */ 938 + ASSERT(sc->flags & XCHK_FSGATES_RMAP); 939 + xfs_rmap_hook_setup(&rr->rhook, xrep_rtrmapbt_live_update); 940 + error = xfs_rmap_hook_add(rtg_group(sc->sr.rtg), &rr->rhook); 941 + if (error) 942 + goto out_iscan; 943 + return 0; 944 + 945 + out_iscan: 946 + xchk_iscan_teardown(&rr->iscan); 947 + xfbtree_destroy(&rr->rtrmap_btree); 948 + out_bitmap: 949 + xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); 950 + mutex_destroy(&rr->lock); 951 + return error; 952 + } 953 + 954 + /* Tear down scan components. */ 955 + STATIC void 956 + xrep_rtrmap_teardown( 957 + struct xrep_rtrmap *rr) 958 + { 959 + struct xfs_scrub *sc = rr->sc; 960 + 961 + xchk_iscan_abort(&rr->iscan); 962 + xfs_rmap_hook_del(rtg_group(sc->sr.rtg), &rr->rhook); 963 + xchk_iscan_teardown(&rr->iscan); 964 + xfbtree_destroy(&rr->rtrmap_btree); 965 + xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks); 966 + mutex_destroy(&rr->lock); 967 + } 968 + 969 + /* Repair the realtime rmap btree. */ 970 + int 971 + xrep_rtrmapbt( 972 + struct xfs_scrub *sc) 973 + { 974 + struct xrep_rtrmap *rr = sc->buf; 975 + int error; 976 + 977 + /* Make sure any problems with the fork are fixed. 
*/ 978 + error = xrep_metadata_inode_forks(sc); 979 + if (error) 980 + return error; 981 + 982 + error = xrep_rtrmap_setup_scan(rr); 983 + if (error) 984 + return error; 985 + 986 + /* Collect rmaps for realtime files. */ 987 + error = xrep_rtrmap_find_rmaps(rr); 988 + if (error) 989 + goto out_records; 990 + 991 + xfs_trans_ijoin(sc->tp, sc->ip, 0); 992 + 993 + /* Rebuild the rtrmap information. */ 994 + error = xrep_rtrmap_build_new_tree(rr); 995 + if (error) 996 + goto out_records; 997 + 998 + /* Kill the old tree. */ 999 + error = xrep_rtrmap_remove_old_tree(rr); 1000 + if (error) 1001 + goto out_records; 1002 + 1003 + out_records: 1004 + xrep_rtrmap_teardown(rr); 1005 + return error; 1006 + }

+9 -8

fs/xfs/scrub/rtsummary.c

··· 81 81 if (error) 82 82 return error; 83 83 84 - error = xchk_install_live_inode(sc, 85 - sc->sr.rtg->rtg_inodes[XFS_RTGI_SUMMARY]); 84 + error = xchk_install_live_inode(sc, rtg_summary(sc->sr.rtg)); 86 85 if (error) 87 86 return error; 88 87 89 88 error = xchk_ino_dqattach(sc); 89 + if (error) 90 + return error; 91 + 92 + error = xchk_rtgroup_lock(sc, &sc->sr, XFS_RTGLOCK_BITMAP); 90 93 if (error) 91 94 return error; 92 95 ··· 103 100 * exclusively here. If we ever start caring about running concurrent 104 101 * fsmap with scrub this could be changed. 105 102 */ 106 - xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP); 107 103 if (mp->m_sb.sb_rblocks) { 108 104 rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks); 109 105 rts->rbmblocks = xfs_rtbitmap_blockcount(mp); ··· 193 191 rtlen = xfs_rtxlen_to_extlen(mp, rec->ar_extcount); 194 192 195 193 if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { 196 - xchk_ino_xref_set_corrupt(sc, 197 - rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_ino); 194 + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); 198 195 return -EFSCORRUPTED; 199 196 } 200 197 ··· 219 218 220 219 /* If the bitmap size doesn't match the computed size, bail. */ 221 220 if (XFS_FSB_TO_B(mp, xfs_rtbitmap_blockcount(mp)) != 222 - rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_disk_size) 221 + rtg_bitmap(rtg)->i_disk_size) 223 222 return -EFSCORRUPTED; 224 223 225 224 return xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtsum_record_free, sc); ··· 311 310 { 312 311 struct xfs_mount *mp = sc->mp; 313 312 struct xfs_rtgroup *rtg = sc->sr.rtg; 314 - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; 315 - struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; 313 + struct xfs_inode *rbmip = rtg_bitmap(rtg); 314 + struct xfs_inode *rsumip = rtg_summary(rtg); 316 315 struct xchk_rtsummary *rts = sc->buf; 317 316 int error; 318 317

+2 -1

fs/xfs/scrub/rtsummary_repair.c

··· 165 165 * Now exchange the contents. Nothing in repair uses the temporary 166 166 * buffer, so we can reuse it for the tempfile exchrange information. 167 167 */ 168 - error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, &rts->tempexch); 168 + error = xrep_tempexch_trans_reserve(sc, XFS_DATA_FORK, 0, 169 + rts->rsumblocks, &rts->tempexch); 169 170 if (error) 170 171 return error; 171 172

+17 -1

fs/xfs/scrub/scrub.c

··· 164 164 trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); 165 165 166 166 if (sc->flags & XCHK_FSGATES_DRAIN) 167 - xfs_drain_wait_disable(); 167 + xfs_defer_drain_wait_disable(); 168 168 169 169 if (sc->flags & XCHK_FSGATES_QUOTA) 170 170 xfs_dqtrx_hook_disable(); ··· 218 218 int error) 219 219 { 220 220 xchk_ag_free(sc, &sc->sa); 221 + xchk_rtgroup_btcur_free(&sc->sr); 222 + 221 223 if (sc->tp) { 222 224 if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) 223 225 error = xfs_trans_commit(sc->tp); ··· 459 457 .scrub = xchk_rgsuperblock, 460 458 .has = xfs_has_rtsb, 461 459 .repair = xrep_rgsuperblock, 460 + }, 461 + [XFS_SCRUB_TYPE_RTRMAPBT] = { /* realtime group rmapbt */ 462 + .type = ST_RTGROUP, 463 + .setup = xchk_setup_rtrmapbt, 464 + .scrub = xchk_rtrmapbt, 465 + .has = xfs_has_rtrmapbt, 466 + .repair = xrep_rtrmapbt, 467 + }, 468 + [XFS_SCRUB_TYPE_RTREFCBT] = { /* realtime refcountbt */ 469 + .type = ST_RTGROUP, 470 + .setup = xchk_setup_rtrefcountbt, 471 + .scrub = xchk_rtrefcountbt, 472 + .has = xfs_has_rtreflink, 473 + .repair = xrep_rtrefcountbt, 462 474 }, 463 475 }; 464 476

+27 -1

fs/xfs/scrub/scrub.h

··· 96 96 int (*repair_eval)(struct xfs_scrub *sc); 97 97 98 98 /* Decide if we even have this piece of metadata. */ 99 - bool (*has)(struct xfs_mount *); 99 + bool (*has)(const struct xfs_mount *); 100 100 101 101 /* type describing required/allowed inputs */ 102 102 enum xchk_type type; ··· 126 126 127 127 /* XFS_RTGLOCK_* lock state if locked */ 128 128 unsigned int rtlock_flags; 129 + 130 + /* rtgroup btrees */ 131 + struct xfs_btree_cur *rmap_cur; 132 + struct xfs_btree_cur *refc_cur; 129 133 }; 130 134 131 135 struct xfs_scrub { ··· 284 280 int xchk_rtbitmap(struct xfs_scrub *sc); 285 281 int xchk_rtsummary(struct xfs_scrub *sc); 286 282 int xchk_rgsuperblock(struct xfs_scrub *sc); 283 + int xchk_rtrmapbt(struct xfs_scrub *sc); 284 + int xchk_rtrefcountbt(struct xfs_scrub *sc); 287 285 #else 288 286 # define xchk_rtbitmap xchk_nothing 289 287 # define xchk_rtsummary xchk_nothing 290 288 # define xchk_rgsuperblock xchk_nothing 289 + # define xchk_rtrmapbt xchk_nothing 290 + # define xchk_rtrefcountbt xchk_nothing 291 291 #endif 292 292 #ifdef CONFIG_XFS_QUOTA 293 293 int xchk_quota(struct xfs_scrub *sc); ··· 325 317 #ifdef CONFIG_XFS_RT 326 318 void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, 327 319 xfs_extlen_t len); 320 + void xchk_xref_has_no_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, 321 + xfs_extlen_t len); 322 + void xchk_xref_has_rt_owner(struct xfs_scrub *sc, xfs_rgblock_t rgbno, 323 + xfs_extlen_t len); 324 + void xchk_xref_is_only_rt_owned_by(struct xfs_scrub *sc, xfs_rgblock_t rgbno, 325 + xfs_extlen_t len, const struct xfs_owner_info *oinfo); 326 + void xchk_xref_is_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, 327 + xfs_extlen_t len); 328 + void xchk_xref_is_not_rt_shared(struct xfs_scrub *sc, xfs_rgblock_t rgbno, 329 + xfs_extlen_t len); 330 + void xchk_xref_is_not_rt_cow_staging(struct xfs_scrub *sc, xfs_rgblock_t rgbno, 331 + xfs_extlen_t len); 328 332 #else 329 333 # define 
xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) 334 + # define xchk_xref_has_no_rt_owner(sc, rtbno, len) do { } while (0) 335 + # define xchk_xref_has_rt_owner(sc, rtbno, len) do { } while (0) 336 + # define xchk_xref_is_only_rt_owned_by(sc, bno, len, oinfo) do { } while (0) 337 + # define xchk_xref_is_rt_cow_staging(sc, bno, len) do { } while (0) 338 + # define xchk_xref_is_not_rt_shared(sc, bno, len) do { } while (0) 339 + # define xchk_xref_is_not_rt_cow_staging(sc, bno, len) do { } while (0) 330 340 #endif 331 341 332 342 #endif /* __XFS_SCRUB_SCRUB_H__ */

+2

fs/xfs/scrub/stats.c

··· 82 82 [XFS_SCRUB_TYPE_DIRTREE] = "dirtree", 83 83 [XFS_SCRUB_TYPE_METAPATH] = "metapath", 84 84 [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper", 85 + [XFS_SCRUB_TYPE_RTRMAPBT] = "rtrmapbt", 86 + [XFS_SCRUB_TYPE_RTREFCBT] = "rtrefcountbt", 85 87 }; 86 88 87 89 /* Format the scrub stats into a text buffer, similar to pcp style. */

+1 -1

fs/xfs/scrub/tempexch.h

··· 12 12 }; 13 13 14 14 int xrep_tempexch_trans_reserve(struct xfs_scrub *sc, int whichfork, 15 - struct xrep_tempexch *ti); 15 + xfs_fileoff_t off, xfs_filblks_t len, struct xrep_tempexch *ti); 16 16 int xrep_tempexch_trans_alloc(struct xfs_scrub *sc, int whichfork, 17 17 struct xrep_tempexch *ti); 18 18

+14 -7

fs/xfs/scrub/tempfile.c

··· 606 606 xrep_tempexch_prep_request( 607 607 struct xfs_scrub *sc, 608 608 int whichfork, 609 + xfs_fileoff_t off, 610 + xfs_filblks_t len, 609 611 struct xrep_tempexch *tx) 610 612 { 611 613 struct xfs_exchmaps_req *req = &tx->req; ··· 631 629 /* Exchange all mappings in both forks. */ 632 630 req->ip1 = sc->tempip; 633 631 req->ip2 = sc->ip; 634 - req->startoff1 = 0; 635 - req->startoff2 = 0; 632 + req->startoff1 = off; 633 + req->startoff2 = off; 636 634 switch (whichfork) { 637 635 case XFS_ATTR_FORK: 638 636 req->flags |= XFS_EXCHMAPS_ATTR_FORK; 639 637 break; 640 638 case XFS_DATA_FORK: 641 - /* Always exchange sizes when exchanging data fork mappings. */ 642 - req->flags |= XFS_EXCHMAPS_SET_SIZES; 639 + /* Exchange sizes when exchanging all data fork mappings. */ 640 + if (off == 0 && len == XFS_MAX_FILEOFF) 641 + req->flags |= XFS_EXCHMAPS_SET_SIZES; 643 642 break; 644 643 } 645 - req->blockcount = XFS_MAX_FILEOFF; 644 + req->blockcount = len; 646 645 647 646 return 0; 648 647 } ··· 752 749 * or the two inodes have the same dquots. 
753 750 */ 754 751 if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || 752 + xfs_is_metadir_inode(req->ip1) || 755 753 (req->ip1->i_udquot == req->ip2->i_udquot && 756 754 req->ip1->i_gdquot == req->ip2->i_gdquot && 757 755 req->ip1->i_pdquot == req->ip2->i_pdquot)) ··· 799 795 xrep_tempexch_trans_reserve( 800 796 struct xfs_scrub *sc, 801 797 int whichfork, 798 + xfs_fileoff_t off, 799 + xfs_filblks_t len, 802 800 struct xrep_tempexch *tx) 803 801 { 804 802 int error; ··· 809 803 xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL); 810 804 xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL); 811 805 812 - error = xrep_tempexch_prep_request(sc, whichfork, tx); 806 + error = xrep_tempexch_prep_request(sc, whichfork, off, len, tx); 813 807 if (error) 814 808 return error; 815 809 ··· 847 841 ASSERT(sc->tp == NULL); 848 842 ASSERT(xfs_has_exchange_range(sc->mp)); 849 843 850 - error = xrep_tempexch_prep_request(sc, whichfork, tx); 844 + error = xrep_tempexch_prep_request(sc, whichfork, 0, XFS_MAX_FILEOFF, 845 + tx); 851 846 if (error) 852 847 return error; 853 848

+1

fs/xfs/scrub/trace.c

··· 21 21 #include "xfs_rmap.h" 22 22 #include "xfs_parent.h" 23 23 #include "xfs_metafile.h" 24 + #include "xfs_rtgroup.h" 24 25 #include "scrub/scrub.h" 25 26 #include "scrub/xfile.h" 26 27 #include "scrub/xfarray.h"

+255 -25

fs/xfs/scrub/trace.h

··· 17 17 #include "xfs_bit.h" 18 18 #include "xfs_quota_defs.h" 19 19 20 + struct xfs_rtgroup; 20 21 struct xfs_scrub; 21 22 struct xfile; 22 23 struct xfarray; ··· 40 39 */ 41 40 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); 42 41 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); 42 + 43 + TRACE_DEFINE_ENUM(XG_TYPE_AG); 44 + TRACE_DEFINE_ENUM(XG_TYPE_RTG); 43 45 44 46 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); 45 47 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); ··· 76 72 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER); 77 73 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH); 78 74 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER); 75 + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTRMAPBT); 76 + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTREFCBT); 79 77 80 78 #define XFS_SCRUB_TYPE_STRINGS \ 81 79 { XFS_SCRUB_TYPE_PROBE, "probe" }, \ ··· 111 105 { XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \ 112 106 { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \ 113 107 { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \ 114 - { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" } 108 + { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }, \ 109 + { XFS_SCRUB_TYPE_RTRMAPBT, "rtrmapbt" }, \ 110 + { XFS_SCRUB_TYPE_RTREFCBT, "rtrefcountbt" } 115 111 116 112 #define XFS_SCRUB_FLAG_STRINGS \ 117 113 { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ ··· 1964 1956 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) 1965 1957 1966 1958 DECLARE_EVENT_CLASS(xrep_extent_class, 1967 - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, 1959 + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, 1968 1960 xfs_extlen_t len), 1969 - TP_ARGS(pag, agbno, len), 1961 + TP_ARGS(xg, agbno, len), 1970 1962 TP_STRUCT__entry( 1971 1963 __field(dev_t, dev) 1964 + __field(enum xfs_group_type, type) 1972 1965 __field(xfs_agnumber_t, agno) 1973 1966 __field(xfs_agblock_t, agbno) 1974 1967 __field(xfs_extlen_t, len) 1975 1968 ), 1976 1969 TP_fast_assign( 1977 - __entry->dev = pag_mount(pag)->m_super->s_dev; 1978 - __entry->agno = pag_agno(pag); 1970 + __entry->dev = xg->xg_mount->m_super->s_dev; 1971 + __entry->type = xg->xg_type; 
1972 + __entry->agno = xg->xg_gno; 1979 1973 __entry->agbno = agbno; 1980 1974 __entry->len = len; 1981 1975 ), 1982 - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", 1976 + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x", 1983 1977 MAJOR(__entry->dev), MINOR(__entry->dev), 1978 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 1984 1979 __entry->agno, 1980 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 1985 1981 __entry->agbno, 1986 1982 __entry->len) 1987 1983 ); 1988 1984 #define DEFINE_REPAIR_EXTENT_EVENT(name) \ 1989 1985 DEFINE_EVENT(xrep_extent_class, name, \ 1990 - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ 1986 + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ 1991 1987 xfs_extlen_t len), \ 1992 - TP_ARGS(pag, agbno, len)) 1988 + TP_ARGS(xg, agbno, len)) 1993 1989 DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); 1994 1990 DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); 1995 1991 DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); ··· 2001 1989 DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); 2002 1990 2003 1991 DECLARE_EVENT_CLASS(xrep_reap_find_class, 2004 - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, 1992 + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, 2005 1993 xfs_extlen_t len, bool crosslinked), 2006 - TP_ARGS(pag, agbno, len, crosslinked), 1994 + TP_ARGS(xg, agbno, len, crosslinked), 2007 1995 TP_STRUCT__entry( 2008 1996 __field(dev_t, dev) 1997 + __field(enum xfs_group_type, type) 2009 1998 __field(xfs_agnumber_t, agno) 2010 1999 __field(xfs_agblock_t, agbno) 2011 2000 __field(xfs_extlen_t, len) 2012 2001 __field(bool, crosslinked) 2013 2002 ), 2014 2003 TP_fast_assign( 2015 - __entry->dev = pag_mount(pag)->m_super->s_dev; 2016 - __entry->agno = pag_agno(pag); 2004 + __entry->dev = xg->xg_mount->m_super->s_dev; 2005 + __entry->type = xg->xg_type; 2006 + __entry->agno = xg->xg_gno; 2017 2007 __entry->agbno = agbno; 2018 2008 __entry->len = len; 2019 2009 
__entry->crosslinked = crosslinked; 2020 2010 ), 2021 - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", 2011 + TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x crosslinked %d", 2022 2012 MAJOR(__entry->dev), MINOR(__entry->dev), 2013 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 2023 2014 __entry->agno, 2015 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 2024 2016 __entry->agbno, 2025 2017 __entry->len, 2026 2018 __entry->crosslinked ? 1 : 0) 2027 2019 ); 2028 2020 #define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ 2029 2021 DEFINE_EVENT(xrep_reap_find_class, name, \ 2030 - TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \ 2022 + TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \ 2031 2023 xfs_extlen_t len, bool crosslinked), \ 2032 - TP_ARGS(pag, agbno, len, crosslinked)) 2024 + TP_ARGS(xg, agbno, len, crosslinked)) 2033 2025 DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); 2034 2026 DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select); 2035 2027 ··· 2124 2108 ) 2125 2109 2126 2110 TRACE_EVENT(xrep_refc_found, 2127 - TP_PROTO(const struct xfs_perag *pag, 2111 + TP_PROTO(const struct xfs_group *xg, 2128 2112 const struct xfs_refcount_irec *rec), 2129 - TP_ARGS(pag, rec), 2113 + TP_ARGS(xg, rec), 2130 2114 TP_STRUCT__entry( 2131 2115 __field(dev_t, dev) 2132 2116 __field(xfs_agnumber_t, agno) 2133 2117 __field(enum xfs_refc_domain, domain) 2118 + __field(enum xfs_group_type, type) 2134 2119 __field(xfs_agblock_t, startblock) 2135 2120 __field(xfs_extlen_t, blockcount) 2136 2121 __field(xfs_nlink_t, refcount) 2137 2122 ), 2138 2123 TP_fast_assign( 2139 - __entry->dev = pag_mount(pag)->m_super->s_dev; 2140 - __entry->agno = pag_agno(pag); 2124 + __entry->dev = xg->xg_mount->m_super->s_dev; 2125 + __entry->agno = xg->xg_gno; 2126 + __entry->type = xg->xg_type; 2141 2127 __entry->domain = rec->rc_domain; 2142 2128 __entry->startblock = rec->rc_startblock; 2143 2129 __entry->blockcount = rec->rc_blockcount; 2144 
2130 __entry->refcount = rec->rc_refcount; 2145 2131 ), 2146 - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", 2132 + TP_printk("dev %d:%d %sno 0x%x dom %s %sbno 0x%x fsbcount 0x%x refcount %u", 2147 2133 MAJOR(__entry->dev), MINOR(__entry->dev), 2134 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 2148 2135 __entry->agno, 2149 2136 __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), 2137 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 2150 2138 __entry->startblock, 2151 2139 __entry->blockcount, 2152 2140 __entry->refcount) ··· 2302 2282 __entry->rmapbt_sz, 2303 2283 __entry->refcbt_sz) 2304 2284 ) 2285 + 2286 + #ifdef CONFIG_XFS_RT 2287 + TRACE_EVENT(xrep_calc_rtgroup_resblks_btsize, 2288 + TP_PROTO(struct xfs_mount *mp, xfs_rgnumber_t rgno, 2289 + xfs_rgblock_t usedlen, xfs_rgblock_t rmapbt_sz), 2290 + TP_ARGS(mp, rgno, usedlen, rmapbt_sz), 2291 + TP_STRUCT__entry( 2292 + __field(dev_t, dev) 2293 + __field(xfs_rgnumber_t, rgno) 2294 + __field(xfs_rgblock_t, usedlen) 2295 + __field(xfs_rgblock_t, rmapbt_sz) 2296 + ), 2297 + TP_fast_assign( 2298 + __entry->dev = mp->m_super->s_dev; 2299 + __entry->rgno = rgno; 2300 + __entry->usedlen = usedlen; 2301 + __entry->rmapbt_sz = rmapbt_sz; 2302 + ), 2303 + TP_printk("dev %d:%d rgno 0x%x usedlen %u rmapbt %u", 2304 + MAJOR(__entry->dev), MINOR(__entry->dev), 2305 + __entry->rgno, 2306 + __entry->usedlen, 2307 + __entry->rmapbt_sz) 2308 + ); 2309 + #endif /* CONFIG_XFS_RT */ 2310 + 2305 2311 TRACE_EVENT(xrep_reset_counters, 2306 2312 TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc), 2307 2313 TP_ARGS(mp, fsc), ··· 2726 2680 DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); 2727 2681 2728 2682 TRACE_EVENT(xrep_rmap_live_update, 2729 - TP_PROTO(const struct xfs_perag *pag, unsigned int op, 2683 + TP_PROTO(const struct xfs_group *xg, unsigned int op, 2730 2684 const struct xfs_rmap_update_params *p), 2731 - TP_ARGS(pag, op, p), 2685 + TP_ARGS(xg, op, p), 2732 
2686 TP_STRUCT__entry( 2733 2687 __field(dev_t, dev) 2688 + __field(enum xfs_group_type, type) 2734 2689 __field(xfs_agnumber_t, agno) 2735 2690 __field(unsigned int, op) 2736 2691 __field(xfs_agblock_t, agbno) ··· 2741 2694 __field(unsigned int, flags) 2742 2695 ), 2743 2696 TP_fast_assign( 2744 - __entry->dev = pag_mount(pag)->m_super->s_dev; 2745 - __entry->agno = pag_agno(pag); 2697 + __entry->dev = xg->xg_mount->m_super->s_dev; 2698 + __entry->type = xg->xg_type; 2699 + __entry->agno = xg->xg_gno; 2746 2700 __entry->op = op; 2747 2701 __entry->agbno = p->startblock; 2748 2702 __entry->len = p->blockcount; ··· 2752 2704 if (p->unwritten) 2753 2705 __entry->flags |= XFS_RMAP_UNWRITTEN; 2754 2706 ), 2755 - TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", 2707 + TP_printk("dev %d:%d %sno 0x%x op %d %sbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", 2756 2708 MAJOR(__entry->dev), MINOR(__entry->dev), 2709 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 2757 2710 __entry->agno, 2758 2711 __entry->op, 2712 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 2759 2713 __entry->agbno, 2760 2714 __entry->len, 2761 2715 __entry->owner, ··· 3654 3604 DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink); 3655 3605 DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink); 3656 3606 DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link); 3607 + 3608 + #ifdef CONFIG_XFS_RT 3609 + DECLARE_EVENT_CLASS(xrep_rtbitmap_class, 3610 + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, xfs_rtxnum_t end), 3611 + TP_ARGS(mp, start, end), 3612 + TP_STRUCT__entry( 3613 + __field(dev_t, dev) 3614 + __field(dev_t, rtdev) 3615 + __field(xfs_rtxnum_t, start) 3616 + __field(xfs_rtxnum_t, end) 3617 + ), 3618 + TP_fast_assign( 3619 + __entry->dev = mp->m_super->s_dev; 3620 + __entry->rtdev = mp->m_rtdev_targp->bt_dev; 3621 + __entry->start = start; 3622 + __entry->end = end; 3623 + ), 3624 + TP_printk("dev %d:%d rtdev %d:%d startrtx 0x%llx 
endrtx 0x%llx", 3625 + MAJOR(__entry->dev), MINOR(__entry->dev), 3626 + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), 3627 + __entry->start, 3628 + __entry->end) 3629 + ); 3630 + #define DEFINE_REPAIR_RGBITMAP_EVENT(name) \ 3631 + DEFINE_EVENT(xrep_rtbitmap_class, name, \ 3632 + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, \ 3633 + xfs_rtxnum_t end), \ 3634 + TP_ARGS(mp, start, end)) 3635 + DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free); 3636 + DEFINE_REPAIR_RGBITMAP_EVENT(xrep_rtbitmap_record_free_bulk); 3637 + 3638 + TRACE_EVENT(xrep_rtbitmap_or, 3639 + TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff, 3640 + xfs_rtword_t mask, xfs_rtword_t word), 3641 + TP_ARGS(mp, wordoff, mask, word), 3642 + TP_STRUCT__entry( 3643 + __field(dev_t, dev) 3644 + __field(dev_t, rtdev) 3645 + __field(unsigned long long, wordoff) 3646 + __field(unsigned int, mask) 3647 + __field(unsigned int, word) 3648 + ), 3649 + TP_fast_assign( 3650 + __entry->dev = mp->m_super->s_dev; 3651 + __entry->rtdev = mp->m_rtdev_targp->bt_dev; 3652 + __entry->wordoff = wordoff; 3653 + __entry->mask = mask; 3654 + __entry->word = word; 3655 + ), 3656 + TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx mask 0x%x word 0x%x", 3657 + MAJOR(__entry->dev), MINOR(__entry->dev), 3658 + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), 3659 + __entry->wordoff, 3660 + __entry->mask, 3661 + __entry->word) 3662 + ); 3663 + 3664 + TRACE_EVENT(xrep_rtbitmap_load, 3665 + TP_PROTO(struct xfs_rtgroup *rtg, xfs_fileoff_t rbmoff, 3666 + xfs_rtxnum_t rtx, xfs_rtxnum_t len), 3667 + TP_ARGS(rtg, rbmoff, rtx, len), 3668 + TP_STRUCT__entry( 3669 + __field(dev_t, dev) 3670 + __field(dev_t, rtdev) 3671 + __field(xfs_rgnumber_t, rgno) 3672 + __field(xfs_fileoff_t, rbmoff) 3673 + __field(xfs_rtxnum_t, rtx) 3674 + __field(xfs_rtxnum_t, len) 3675 + ), 3676 + TP_fast_assign( 3677 + __entry->dev = rtg_mount(rtg)->m_super->s_dev; 3678 + __entry->rtdev = rtg_mount(rtg)->m_rtdev_targp->bt_dev; 3679 + __entry->rgno = 
rtg_rgno(rtg); 3680 + __entry->rbmoff = rbmoff; 3681 + __entry->rtx = rtx; 3682 + __entry->len = len; 3683 + ), 3684 + TP_printk("dev %d:%d rtdev %d:%d rgno 0x%x rbmoff 0x%llx rtx 0x%llx rtxcount 0x%llx", 3685 + MAJOR(__entry->dev), MINOR(__entry->dev), 3686 + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), 3687 + __entry->rgno, 3688 + __entry->rbmoff, 3689 + __entry->rtx, 3690 + __entry->len) 3691 + ); 3692 + 3693 + TRACE_EVENT(xrep_rtbitmap_load_words, 3694 + TP_PROTO(struct xfs_mount *mp, xfs_fileoff_t rbmoff, 3695 + unsigned long long wordoff, unsigned int wordcnt), 3696 + TP_ARGS(mp, rbmoff, wordoff, wordcnt), 3697 + TP_STRUCT__entry( 3698 + __field(dev_t, dev) 3699 + __field(dev_t, rtdev) 3700 + __field(xfs_fileoff_t, rbmoff) 3701 + __field(unsigned long long, wordoff) 3702 + __field(unsigned int, wordcnt) 3703 + ), 3704 + TP_fast_assign( 3705 + __entry->dev = mp->m_super->s_dev; 3706 + __entry->rtdev = mp->m_rtdev_targp->bt_dev; 3707 + __entry->rbmoff = rbmoff; 3708 + __entry->wordoff = wordoff; 3709 + __entry->wordcnt = wordcnt; 3710 + ), 3711 + TP_printk("dev %d:%d rtdev %d:%d rbmoff 0x%llx wordoff 0x%llx wordcnt 0x%x", 3712 + MAJOR(__entry->dev), MINOR(__entry->dev), 3713 + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), 3714 + __entry->rbmoff, 3715 + __entry->wordoff, 3716 + __entry->wordcnt) 3717 + ); 3718 + 3719 + TRACE_EVENT(xrep_rtbitmap_load_word, 3720 + TP_PROTO(struct xfs_mount *mp, unsigned long long wordoff, 3721 + unsigned int bit, xfs_rtword_t ondisk_word, 3722 + xfs_rtword_t xfile_word, xfs_rtword_t word_mask), 3723 + TP_ARGS(mp, wordoff, bit, ondisk_word, xfile_word, word_mask), 3724 + TP_STRUCT__entry( 3725 + __field(dev_t, dev) 3726 + __field(dev_t, rtdev) 3727 + __field(unsigned long long, wordoff) 3728 + __field(unsigned int, bit) 3729 + __field(xfs_rtword_t, ondisk_word) 3730 + __field(xfs_rtword_t, xfile_word) 3731 + __field(xfs_rtword_t, word_mask) 3732 + ), 3733 + TP_fast_assign( 3734 + __entry->dev = mp->m_super->s_dev; 3735 + 
__entry->rtdev = mp->m_rtdev_targp->bt_dev; 3736 + __entry->wordoff = wordoff; 3737 + __entry->bit = bit; 3738 + __entry->ondisk_word = ondisk_word; 3739 + __entry->xfile_word = xfile_word; 3740 + __entry->word_mask = word_mask; 3741 + ), 3742 + TP_printk("dev %d:%d rtdev %d:%d wordoff 0x%llx bit %u ondisk 0x%x(0x%x) inmem 0x%x(0x%x) result 0x%x mask 0x%x", 3743 + MAJOR(__entry->dev), MINOR(__entry->dev), 3744 + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), 3745 + __entry->wordoff, 3746 + __entry->bit, 3747 + __entry->ondisk_word, 3748 + __entry->ondisk_word & __entry->word_mask, 3749 + __entry->xfile_word, 3750 + __entry->xfile_word & ~__entry->word_mask, 3751 + (__entry->xfile_word & ~__entry->word_mask) | 3752 + (__entry->ondisk_word & __entry->word_mask), 3753 + __entry->word_mask) 3754 + ); 3755 + 3756 + TRACE_EVENT(xrep_rtrmap_found, 3757 + TP_PROTO(struct xfs_mount *mp, const struct xfs_rmap_irec *rec), 3758 + TP_ARGS(mp, rec), 3759 + TP_STRUCT__entry( 3760 + __field(dev_t, dev) 3761 + __field(dev_t, rtdev) 3762 + __field(xfs_rgblock_t, rgbno) 3763 + __field(xfs_extlen_t, len) 3764 + __field(uint64_t, owner) 3765 + __field(uint64_t, offset) 3766 + __field(unsigned int, flags) 3767 + ), 3768 + TP_fast_assign( 3769 + __entry->dev = mp->m_super->s_dev; 3770 + __entry->rtdev = mp->m_rtdev_targp->bt_dev; 3771 + __entry->rgbno = rec->rm_startblock; 3772 + __entry->len = rec->rm_blockcount; 3773 + __entry->owner = rec->rm_owner; 3774 + __entry->offset = rec->rm_offset; 3775 + __entry->flags = rec->rm_flags; 3776 + ), 3777 + TP_printk("dev %d:%d rtdev %d:%d rgbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", 3778 + MAJOR(__entry->dev), MINOR(__entry->dev), 3779 + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), 3780 + __entry->rgbno, 3781 + __entry->len, 3782 + __entry->owner, 3783 + __entry->offset, 3784 + __entry->flags) 3785 + ); 3786 + #endif /* CONFIG_XFS_RT */ 3657 3787 3658 3788 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ 3659 3789

+1 -1

fs/xfs/xfs_aops.c

··· 131 131 error = xfs_iomap_write_unwritten(ip, offset, size, false); 132 132 133 133 if (!error && xfs_ioend_is_append(ioend)) 134 - error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); 134 + error = xfs_setfilesize(ip, offset, size); 135 135 done: 136 136 iomap_finish_ioends(ioend, error); 137 137 memalloc_nofs_restore(nofs_flag);

-5

fs/xfs/xfs_attr_inactive.c

··· 305 305 XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp); 306 306 if (error) 307 307 return error; 308 - error = bp->b_error; 309 - if (error) { 310 - xfs_trans_brelse(*trans, bp); 311 - return error; 312 - } 313 308 xfs_trans_binval(*trans, bp); /* remove from cache */ 314 309 /* 315 310 * Commit the invalidate and start the next transaction.

+233 -373

fs/xfs/xfs_buf.c

··· 22 22 #include "xfs_error.h" 23 23 #include "xfs_ag.h" 24 24 #include "xfs_buf_mem.h" 25 + #include "xfs_notify_failure.h" 25 26 26 27 struct kmem_cache *xfs_buf_cache; 27 28 ··· 53 52 * b_lock (trylock due to inversion) 54 53 */ 55 54 56 - static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); 57 - 58 - static inline int 59 - xfs_buf_submit( 60 - struct xfs_buf *bp) 61 - { 62 - return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); 63 - } 55 + static void xfs_buf_submit(struct xfs_buf *bp); 56 + static int xfs_buf_iowait(struct xfs_buf *bp); 64 57 65 58 static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) 66 59 { ··· 127 132 } 128 133 } 129 134 130 - static inline void 131 - xfs_buf_ioacct_dec( 132 - struct xfs_buf *bp) 133 - { 134 - spin_lock(&bp->b_lock); 135 - __xfs_buf_ioacct_dec(bp); 136 - spin_unlock(&bp->b_lock); 137 - } 138 - 139 135 /* 140 136 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 141 137 * b_lru_ref count so that the buffer is freed immediately when the buffer ··· 162 176 atomic_set(&bp->b_lru_ref, 0); 163 177 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 164 178 (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) 165 - atomic_dec(&bp->b_hold); 179 + bp->b_hold--; 166 180 167 - ASSERT(atomic_read(&bp->b_hold) >= 1); 181 + ASSERT(bp->b_hold >= 1); 168 182 spin_unlock(&bp->b_lock); 169 183 } 170 184 ··· 188 202 return 0; 189 203 } 190 204 191 - /* 192 - * Frees b_pages if it was allocated. 
193 - */ 194 205 static void 195 206 xfs_buf_free_maps( 196 207 struct xfs_buf *bp) ··· 220 237 */ 221 238 flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 222 239 223 - atomic_set(&bp->b_hold, 1); 240 + spin_lock_init(&bp->b_lock); 241 + bp->b_hold = 1; 224 242 atomic_set(&bp->b_lru_ref, 1); 225 243 init_completion(&bp->b_iowait); 226 244 INIT_LIST_HEAD(&bp->b_lru); 227 245 INIT_LIST_HEAD(&bp->b_list); 228 246 INIT_LIST_HEAD(&bp->b_li_list); 229 247 sema_init(&bp->b_sema, 0); /* held, no waiters */ 230 - spin_lock_init(&bp->b_lock); 231 248 bp->b_target = target; 232 249 bp->b_mount = target->bt_mount; 233 250 bp->b_flags = flags; ··· 571 588 return 0; 572 589 } 573 590 591 + static bool 592 + xfs_buf_try_hold( 593 + struct xfs_buf *bp) 594 + { 595 + spin_lock(&bp->b_lock); 596 + if (bp->b_hold == 0) { 597 + spin_unlock(&bp->b_lock); 598 + return false; 599 + } 600 + bp->b_hold++; 601 + spin_unlock(&bp->b_lock); 602 + return true; 603 + } 604 + 574 605 static inline int 575 606 xfs_buf_lookup( 576 607 struct xfs_buf_cache *bch, ··· 597 600 598 601 rcu_read_lock(); 599 602 bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); 600 - if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { 603 + if (!bp || !xfs_buf_try_hold(bp)) { 601 604 rcu_read_unlock(); 602 605 return -ENOENT; 603 606 } ··· 660 663 spin_unlock(&bch->bc_lock); 661 664 goto out_free_buf; 662 665 } 663 - if (bp) { 666 + if (bp && xfs_buf_try_hold(bp)) { 664 667 /* found an existing buffer */ 665 - atomic_inc(&bp->b_hold); 666 668 spin_unlock(&bch->bc_lock); 667 669 error = xfs_buf_find_lock(bp, flags); 668 670 if (error) ··· 802 806 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); 803 807 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 804 808 805 - return xfs_buf_submit(bp); 809 + xfs_buf_submit(bp); 810 + if (flags & XBF_ASYNC) 811 + return 0; 812 + return xfs_buf_iowait(bp); 806 813 } 807 814 808 815 /* ··· 981 982 bp->b_ops = ops; 982 983 
983 984 xfs_buf_submit(bp); 984 - if (bp->b_error) { 985 - error = bp->b_error; 985 + error = xfs_buf_iowait(bp); 986 + if (error) { 986 987 xfs_buf_relse(bp); 987 988 return error; 988 989 } ··· 1042 1043 struct xfs_buf *bp) 1043 1044 { 1044 1045 trace_xfs_buf_hold(bp, _RET_IP_); 1045 - atomic_inc(&bp->b_hold); 1046 + 1047 + spin_lock(&bp->b_lock); 1048 + bp->b_hold++; 1049 + spin_unlock(&bp->b_lock); 1046 1050 } 1047 1051 1048 1052 static void ··· 1053 1051 struct xfs_buf *bp) 1054 1052 { 1055 1053 ASSERT(list_empty(&bp->b_lru)); 1056 - if (atomic_dec_and_test(&bp->b_hold)) { 1057 - xfs_buf_ioacct_dec(bp); 1058 - xfs_buf_free(bp); 1054 + 1055 + spin_lock(&bp->b_lock); 1056 + if (--bp->b_hold) { 1057 + spin_unlock(&bp->b_lock); 1058 + return; 1059 1059 } 1060 + __xfs_buf_ioacct_dec(bp); 1061 + spin_unlock(&bp->b_lock); 1062 + xfs_buf_free(bp); 1060 1063 } 1061 1064 1062 1065 static void ··· 1071 1064 struct xfs_buftarg *btp = bp->b_target; 1072 1065 struct xfs_perag *pag = bp->b_pag; 1073 1066 struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); 1074 - bool release; 1075 1067 bool freebuf = false; 1076 1068 1077 1069 trace_xfs_buf_rele(bp, _RET_IP_); 1078 1070 1079 - ASSERT(atomic_read(&bp->b_hold) > 0); 1080 - 1081 - /* 1082 - * We grab the b_lock here first to serialise racing xfs_buf_rele() 1083 - * calls. The pag_buf_lock being taken on the last reference only 1084 - * serialises against racing lookups in xfs_buf_find(). IOWs, the second 1085 - * to last reference we drop here is not serialised against the last 1086 - * reference until we take bp->b_lock. Hence if we don't grab b_lock 1087 - * first, the last "release" reference can win the race to the lock and 1088 - * free the buffer before the second-to-last reference is processed, 1089 - * leading to a use-after-free scenario. 
1090 - */ 1091 1071 spin_lock(&bp->b_lock); 1092 - release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock); 1093 - if (!release) { 1072 + ASSERT(bp->b_hold >= 1); 1073 + if (bp->b_hold > 1) { 1094 1074 /* 1095 1075 * Drop the in-flight state if the buffer is already on the LRU 1096 1076 * and it holds the only reference. This is racy because we 1097 1077 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT 1098 1078 * ensures the decrement occurs only once per-buf. 1099 1079 */ 1100 - if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) 1080 + if (--bp->b_hold == 1 && !list_empty(&bp->b_lru)) 1101 1081 __xfs_buf_ioacct_dec(bp); 1102 1082 goto out_unlock; 1103 1083 } 1104 1084 1105 - /* the last reference has been dropped ... */ 1085 + /* we are asked to drop the last reference */ 1086 + spin_lock(&bch->bc_lock); 1106 1087 __xfs_buf_ioacct_dec(bp); 1107 1088 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { 1108 1089 /* 1109 - * If the buffer is added to the LRU take a new reference to the 1090 + * If the buffer is added to the LRU, keep the reference to the 1110 1091 * buffer for the LRU and clear the (now stale) dispose list 1111 - * state flag 1092 + * state flag, else drop the reference. 
1112 1093 */ 1113 - if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) { 1094 + if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) 1114 1095 bp->b_state &= ~XFS_BSTATE_DISPOSE; 1115 - atomic_inc(&bp->b_hold); 1116 - } 1096 + else 1097 + bp->b_hold--; 1117 1098 spin_unlock(&bch->bc_lock); 1118 1099 } else { 1100 + bp->b_hold--; 1119 1101 /* 1120 1102 * most of the time buffers will already be removed from the 1121 1103 * LRU, so optimise that case by checking for the ··· 1287 1291 { 1288 1292 struct xfs_mount *mp = bp->b_mount; 1289 1293 struct xfs_error_cfg *cfg; 1294 + struct xfs_log_item *lip; 1290 1295 1291 1296 /* 1292 1297 * If we've already shutdown the journal because of I/O errors, there's ··· 1335 1338 } 1336 1339 1337 1340 /* Still considered a transient error. Caller will schedule retries. */ 1338 - if (bp->b_flags & _XBF_INODES) 1339 - xfs_buf_inode_io_fail(bp); 1340 - else if (bp->b_flags & _XBF_DQUOTS) 1341 - xfs_buf_dquot_io_fail(bp); 1342 - else 1343 - ASSERT(list_empty(&bp->b_li_list)); 1341 + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { 1342 + set_bit(XFS_LI_FAILED, &lip->li_flags); 1343 + clear_bit(XFS_LI_FLUSHING, &lip->li_flags); 1344 + } 1345 + 1344 1346 xfs_buf_ioerror(bp, 0); 1345 1347 xfs_buf_relse(bp); 1346 1348 return true; ··· 1363 1367 { 1364 1368 trace_xfs_buf_iodone(bp, _RET_IP_); 1365 1369 1366 - /* 1367 - * Pull in IO completion errors now. We are guaranteed to be running 1368 - * single threaded, so we don't need the lock to read b_io_error. 
1369 - */ 1370 - if (!bp->b_error && bp->b_io_error) 1371 - xfs_buf_ioerror(bp, bp->b_io_error); 1372 - 1373 1370 if (bp->b_flags & XBF_READ) { 1371 + if (!bp->b_error && xfs_buf_is_vmapped(bp)) 1372 + invalidate_kernel_vmap_range(bp->b_addr, 1373 + xfs_buf_vmap_len(bp)); 1374 1374 if (!bp->b_error && bp->b_ops) 1375 1375 bp->b_ops->verify_read(bp); 1376 1376 if (!bp->b_error) ··· 1393 1401 if (bp->b_log_item) 1394 1402 xfs_buf_item_done(bp); 1395 1403 1396 - if (bp->b_flags & _XBF_INODES) 1397 - xfs_buf_inode_iodone(bp); 1398 - else if (bp->b_flags & _XBF_DQUOTS) 1399 - xfs_buf_dquot_iodone(bp); 1400 - 1404 + if (bp->b_iodone) 1405 + bp->b_iodone(bp); 1401 1406 } 1402 1407 1403 1408 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | ··· 1474 1485 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | 1475 1486 XBF_DONE); 1476 1487 1477 - error = xfs_buf_submit(bp); 1488 + xfs_buf_submit(bp); 1489 + error = xfs_buf_iowait(bp); 1478 1490 if (error) 1479 1491 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 1480 1492 return error; ··· 1485 1495 xfs_buf_bio_end_io( 1486 1496 struct bio *bio) 1487 1497 { 1488 - struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; 1498 + struct xfs_buf *bp = bio->bi_private; 1489 1499 1490 - if (!bio->bi_status && 1491 - (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && 1492 - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) 1493 - bio->bi_status = BLK_STS_IOERR; 1500 + if (bio->bi_status) 1501 + xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); 1502 + else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && 1503 + XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) 1504 + xfs_buf_ioerror(bp, -EIO); 1494 1505 1495 - /* 1496 - * don't overwrite existing errors - otherwise we can lose errors on 1497 - * buffers that require multiple bios to complete. 
1498 - */ 1499 - if (bio->bi_status) { 1500 - int error = blk_status_to_errno(bio->bi_status); 1501 - 1502 - cmpxchg(&bp->b_io_error, 0, error); 1503 - } 1504 - 1505 - if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1506 - invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1507 - 1508 - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) 1509 - xfs_buf_ioend_async(bp); 1506 + xfs_buf_ioend_async(bp); 1510 1507 bio_put(bio); 1511 1508 } 1512 1509 1513 - static void 1514 - xfs_buf_ioapply_map( 1515 - struct xfs_buf *bp, 1516 - int map, 1517 - int *buf_offset, 1518 - int *count, 1519 - blk_opf_t op) 1510 + static inline blk_opf_t 1511 + xfs_buf_bio_op( 1512 + struct xfs_buf *bp) 1520 1513 { 1521 - int page_index; 1522 - unsigned int total_nr_pages = bp->b_page_count; 1523 - int nr_pages; 1524 - struct bio *bio; 1525 - sector_t sector = bp->b_maps[map].bm_bn; 1526 - int size; 1527 - int offset; 1528 - 1529 - /* skip the pages in the buffer before the start offset */ 1530 - page_index = 0; 1531 - offset = *buf_offset; 1532 - while (offset >= PAGE_SIZE) { 1533 - page_index++; 1534 - offset -= PAGE_SIZE; 1535 - } 1536 - 1537 - /* 1538 - * Limit the IO size to the length of the current vector, and update the 1539 - * remaining IO count for the next time around. 
1540 - */ 1541 - size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); 1542 - *count -= size; 1543 - *buf_offset += size; 1544 - 1545 - next_chunk: 1546 - atomic_inc(&bp->b_io_remaining); 1547 - nr_pages = bio_max_segs(total_nr_pages); 1548 - 1549 - bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); 1550 - bio->bi_iter.bi_sector = sector; 1551 - bio->bi_end_io = xfs_buf_bio_end_io; 1552 - bio->bi_private = bp; 1553 - 1554 - for (; size && nr_pages; nr_pages--, page_index++) { 1555 - int rbytes, nbytes = PAGE_SIZE - offset; 1556 - 1557 - if (nbytes > size) 1558 - nbytes = size; 1559 - 1560 - rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, 1561 - offset); 1562 - if (rbytes < nbytes) 1563 - break; 1564 - 1565 - offset = 0; 1566 - sector += BTOBB(nbytes); 1567 - size -= nbytes; 1568 - total_nr_pages--; 1569 - } 1570 - 1571 - if (likely(bio->bi_iter.bi_size)) { 1572 - if (xfs_buf_is_vmapped(bp)) { 1573 - flush_kernel_vmap_range(bp->b_addr, 1574 - xfs_buf_vmap_len(bp)); 1575 - } 1576 - submit_bio(bio); 1577 - if (size) 1578 - goto next_chunk; 1579 - } else { 1580 - /* 1581 - * This is guaranteed not to be the last io reference count 1582 - * because the caller (xfs_buf_submit) holds a count itself. 1583 - */ 1584 - atomic_dec(&bp->b_io_remaining); 1585 - xfs_buf_ioerror(bp, -EIO); 1586 - bio_put(bio); 1587 - } 1588 - 1589 - } 1590 - 1591 - STATIC void 1592 - _xfs_buf_ioapply( 1593 - struct xfs_buf *bp) 1594 - { 1595 - struct blk_plug plug; 1596 - blk_opf_t op; 1597 - int offset; 1598 - int size; 1599 - int i; 1600 - 1601 - /* 1602 - * Make sure we capture only current IO errors rather than stale errors 1603 - * left over from previous use of the buffer (e.g. failed readahead). 1604 - */ 1605 - bp->b_error = 0; 1514 + blk_opf_t op; 1606 1515 1607 1516 if (bp->b_flags & XBF_WRITE) { 1608 1517 op = REQ_OP_WRITE; 1609 - 1610 - /* 1611 - * Run the write verifier callback function if it exists. 
If 1612 - * this function fails it will mark the buffer with an error and 1613 - * the IO should not be dispatched. 1614 - */ 1615 - if (bp->b_ops) { 1616 - bp->b_ops->verify_write(bp); 1617 - if (bp->b_error) { 1618 - xfs_force_shutdown(bp->b_mount, 1619 - SHUTDOWN_CORRUPT_INCORE); 1620 - return; 1621 - } 1622 - } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { 1623 - struct xfs_mount *mp = bp->b_mount; 1624 - 1625 - /* 1626 - * non-crc filesystems don't attach verifiers during 1627 - * log recovery, so don't warn for such filesystems. 1628 - */ 1629 - if (xfs_has_crc(mp)) { 1630 - xfs_warn(mp, 1631 - "%s: no buf ops on daddr 0x%llx len %d", 1632 - __func__, xfs_buf_daddr(bp), 1633 - bp->b_length); 1634 - xfs_hex_dump(bp->b_addr, 1635 - XFS_CORRUPTION_DUMP_LEN); 1636 - dump_stack(); 1637 - } 1638 - } 1639 1518 } else { 1640 1519 op = REQ_OP_READ; 1641 1520 if (bp->b_flags & XBF_READ_AHEAD) 1642 1521 op |= REQ_RAHEAD; 1643 1522 } 1644 1523 1645 - /* we only use the buffer cache for meta-data */ 1646 - op |= REQ_META; 1524 + return op | REQ_META; 1525 + } 1647 1526 1648 - /* in-memory targets are directly mapped, no IO required. 
*/ 1649 - if (xfs_buftarg_is_mem(bp->b_target)) { 1650 - xfs_buf_ioend(bp); 1651 - return; 1527 + static void 1528 + xfs_buf_submit_bio( 1529 + struct xfs_buf *bp) 1530 + { 1531 + unsigned int size = BBTOB(bp->b_length); 1532 + unsigned int map = 0, p; 1533 + struct blk_plug plug; 1534 + struct bio *bio; 1535 + 1536 + bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count, 1537 + xfs_buf_bio_op(bp), GFP_NOIO); 1538 + bio->bi_private = bp; 1539 + bio->bi_end_io = xfs_buf_bio_end_io; 1540 + 1541 + if (bp->b_flags & _XBF_KMEM) { 1542 + __bio_add_page(bio, virt_to_page(bp->b_addr), size, 1543 + bp->b_offset); 1544 + } else { 1545 + for (p = 0; p < bp->b_page_count; p++) 1546 + __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0); 1547 + bio->bi_iter.bi_size = size; /* limit to the actual size used */ 1548 + 1549 + if (xfs_buf_is_vmapped(bp)) 1550 + flush_kernel_vmap_range(bp->b_addr, 1551 + xfs_buf_vmap_len(bp)); 1652 1552 } 1653 1553 1654 1554 /* 1655 - * Walk all the vectors issuing IO on them. Set up the initial offset 1656 - * into the buffer and the desired IO size before we start - 1657 - * _xfs_buf_ioapply_vec() will modify them appropriately for each 1658 - * subsequent call. 1555 + * If there is more than one map segment, split out a new bio for each 1556 + * map except of the last one. The last map is handled by the 1557 + * remainder of the original bio outside the loop. 
1659 1558 */ 1660 - offset = bp->b_offset; 1661 - size = BBTOB(bp->b_length); 1662 1559 blk_start_plug(&plug); 1663 - for (i = 0; i < bp->b_map_count; i++) { 1664 - xfs_buf_ioapply_map(bp, i, &offset, &size, op); 1665 - if (bp->b_error) 1666 - break; 1667 - if (size <= 0) 1668 - break; /* all done */ 1560 + for (map = 0; map < bp->b_map_count - 1; map++) { 1561 + struct bio *split; 1562 + 1563 + split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS, 1564 + &fs_bio_set); 1565 + split->bi_iter.bi_sector = bp->b_maps[map].bm_bn; 1566 + bio_chain(split, bio); 1567 + submit_bio(split); 1669 1568 } 1569 + bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn; 1570 + submit_bio(bio); 1670 1571 blk_finish_plug(&plug); 1671 1572 } 1672 1573 ··· 1578 1697 } 1579 1698 1580 1699 /* 1700 + * Run the write verifier callback function if it exists. If this fails, mark 1701 + * the buffer with an error and do not dispatch the I/O. 1702 + */ 1703 + static bool 1704 + xfs_buf_verify_write( 1705 + struct xfs_buf *bp) 1706 + { 1707 + if (bp->b_ops) { 1708 + bp->b_ops->verify_write(bp); 1709 + if (bp->b_error) 1710 + return false; 1711 + } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { 1712 + /* 1713 + * Non-crc filesystems don't attach verifiers during log 1714 + * recovery, so don't warn for such filesystems. 1715 + */ 1716 + if (xfs_has_crc(bp->b_mount)) { 1717 + xfs_warn(bp->b_mount, 1718 + "%s: no buf ops on daddr 0x%llx len %d", 1719 + __func__, xfs_buf_daddr(bp), 1720 + bp->b_length); 1721 + xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); 1722 + dump_stack(); 1723 + } 1724 + } 1725 + 1726 + return true; 1727 + } 1728 + 1729 + /* 1581 1730 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1582 1731 * the buffer lock ownership and the current reference to the IO. It is not 1583 1732 * safe to reference the buffer after a call to this function unless the caller 1584 1733 * holds an additional reference itself. 
1585 1734 */ 1586 - static int 1587 - __xfs_buf_submit( 1588 - struct xfs_buf *bp, 1589 - bool wait) 1735 + static void 1736 + xfs_buf_submit( 1737 + struct xfs_buf *bp) 1590 1738 { 1591 - int error = 0; 1592 - 1593 1739 trace_xfs_buf_submit(bp, _RET_IP_); 1594 1740 1595 1741 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); ··· 1636 1728 * state here rather than mount state to avoid corrupting the log tail 1637 1729 * on shutdown. 1638 1730 */ 1639 - if (bp->b_mount->m_log && 1640 - xlog_is_shutdown(bp->b_mount->m_log)) { 1731 + if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { 1641 1732 xfs_buf_ioend_fail(bp); 1642 - return -EIO; 1733 + return; 1643 1734 } 1644 - 1645 - /* 1646 - * Grab a reference so the buffer does not go away underneath us. For 1647 - * async buffers, I/O completion drops the callers reference, which 1648 - * could occur before submission returns. 1649 - */ 1650 - xfs_buf_hold(bp); 1651 1735 1652 1736 if (bp->b_flags & XBF_WRITE) 1653 1737 xfs_buf_wait_unpin(bp); 1654 1738 1655 - /* clear the internal error state to avoid spurious errors */ 1656 - bp->b_io_error = 0; 1657 - 1658 1739 /* 1659 - * Set the count to 1 initially, this will stop an I/O completion 1660 - * callout which happens before we have started all the I/O from calling 1661 - * xfs_buf_ioend too early. 1740 + * Make sure we capture only current IO errors rather than stale errors 1741 + * left over from previous use of the buffer (e.g. failed readahead). 1662 1742 */ 1663 - atomic_set(&bp->b_io_remaining, 1); 1743 + bp->b_error = 0; 1744 + 1664 1745 if (bp->b_flags & XBF_ASYNC) 1665 1746 xfs_buf_ioacct_inc(bp); 1666 - _xfs_buf_ioapply(bp); 1667 1747 1668 - /* 1669 - * If _xfs_buf_ioapply failed, we can get back here with only the IO 1670 - * reference we took above. If we drop it to zero, run completion so 1671 - * that we don't return to the caller with completion still pending. 
1672 - */ 1673 - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1674 - if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) 1675 - xfs_buf_ioend(bp); 1676 - else 1677 - xfs_buf_ioend_async(bp); 1748 + if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { 1749 + xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); 1750 + xfs_buf_ioend(bp); 1751 + return; 1678 1752 } 1679 1753 1680 - if (wait) 1681 - error = xfs_buf_iowait(bp); 1754 + /* In-memory targets are directly mapped, no I/O required. */ 1755 + if (xfs_buftarg_is_mem(bp->b_target)) { 1756 + xfs_buf_ioend(bp); 1757 + return; 1758 + } 1682 1759 1683 - /* 1684 - * Release the hold that keeps the buffer referenced for the entire 1685 - * I/O. Note that if the buffer is async, it is not safe to reference 1686 - * after this release. 1687 - */ 1688 - xfs_buf_rele(bp); 1689 - return error; 1760 + xfs_buf_submit_bio(bp); 1690 1761 } 1691 1762 1692 1763 void * ··· 1750 1863 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1751 1864 struct list_head *dispose = arg; 1752 1865 1753 - if (atomic_read(&bp->b_hold) > 1) { 1866 + if (!spin_trylock(&bp->b_lock)) 1867 + return LRU_SKIP; 1868 + if (bp->b_hold > 1) { 1754 1869 /* need to wait, so skip it this pass */ 1870 + spin_unlock(&bp->b_lock); 1755 1871 trace_xfs_buf_drain_buftarg(bp, _RET_IP_); 1756 1872 return LRU_SKIP; 1757 1873 } 1758 - if (!spin_trylock(&bp->b_lock)) 1759 - return LRU_SKIP; 1760 1874 1761 1875 /* 1762 1876 * clear the LRU reference count so the buffer doesn't get ··· 2096 2208 */ 2097 2209 bp->b_flags |= _XBF_DELWRI_Q; 2098 2210 if (list_empty(&bp->b_list)) { 2099 - atomic_inc(&bp->b_hold); 2211 + xfs_buf_hold(bp); 2100 2212 list_add_tail(&bp->b_list, list); 2101 2213 } 2102 2214 ··· 2154 2266 return 0; 2155 2267 } 2156 2268 2157 - /* 2158 - * Submit buffers for write. 
If wait_list is specified, the buffers are 2159 - * submitted using sync I/O and placed on the wait list such that the caller can 2160 - * iowait each buffer. Otherwise async I/O is used and the buffers are released 2161 - * at I/O completion time. In either case, buffers remain locked until I/O 2162 - * completes and the buffer is released from the queue. 2163 - */ 2164 - static int 2165 - xfs_buf_delwri_submit_buffers( 2166 - struct list_head *buffer_list, 2167 - struct list_head *wait_list) 2269 + static bool 2270 + xfs_buf_delwri_submit_prep( 2271 + struct xfs_buf *bp) 2168 2272 { 2169 - struct xfs_buf *bp, *n; 2170 - int pinned = 0; 2171 - struct blk_plug plug; 2172 - 2173 - list_sort(NULL, buffer_list, xfs_buf_cmp); 2174 - 2175 - blk_start_plug(&plug); 2176 - list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2177 - if (!wait_list) { 2178 - if (!xfs_buf_trylock(bp)) 2179 - continue; 2180 - if (xfs_buf_ispinned(bp)) { 2181 - xfs_buf_unlock(bp); 2182 - pinned++; 2183 - continue; 2184 - } 2185 - } else { 2186 - xfs_buf_lock(bp); 2187 - } 2188 - 2189 - /* 2190 - * Someone else might have written the buffer synchronously or 2191 - * marked it stale in the meantime. In that case only the 2192 - * _XBF_DELWRI_Q flag got cleared, and we have to drop the 2193 - * reference and remove it from the list here. 2194 - */ 2195 - if (!(bp->b_flags & _XBF_DELWRI_Q)) { 2196 - xfs_buf_list_del(bp); 2197 - xfs_buf_relse(bp); 2198 - continue; 2199 - } 2200 - 2201 - trace_xfs_buf_delwri_split(bp, _RET_IP_); 2202 - 2203 - /* 2204 - * If we have a wait list, each buffer (and associated delwri 2205 - * queue reference) transfers to it and is submitted 2206 - * synchronously. Otherwise, drop the buffer from the delwri 2207 - * queue and submit async. 
2208 - */ 2209 - bp->b_flags &= ~_XBF_DELWRI_Q; 2210 - bp->b_flags |= XBF_WRITE; 2211 - if (wait_list) { 2212 - bp->b_flags &= ~XBF_ASYNC; 2213 - list_move_tail(&bp->b_list, wait_list); 2214 - } else { 2215 - bp->b_flags |= XBF_ASYNC; 2216 - xfs_buf_list_del(bp); 2217 - } 2218 - __xfs_buf_submit(bp, false); 2273 + /* 2274 + * Someone else might have written the buffer synchronously or marked it 2275 + * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got 2276 + * cleared, and we have to drop the reference and remove it from the 2277 + * list here. 2278 + */ 2279 + if (!(bp->b_flags & _XBF_DELWRI_Q)) { 2280 + xfs_buf_list_del(bp); 2281 + xfs_buf_relse(bp); 2282 + return false; 2219 2283 } 2220 - blk_finish_plug(&plug); 2221 2284 2222 - return pinned; 2285 + trace_xfs_buf_delwri_split(bp, _RET_IP_); 2286 + bp->b_flags &= ~_XBF_DELWRI_Q; 2287 + bp->b_flags |= XBF_WRITE; 2288 + return true; 2223 2289 } 2224 2290 2225 2291 /* ··· 2196 2354 xfs_buf_delwri_submit_nowait( 2197 2355 struct list_head *buffer_list) 2198 2356 { 2199 - return xfs_buf_delwri_submit_buffers(buffer_list, NULL); 2357 + struct xfs_buf *bp, *n; 2358 + int pinned = 0; 2359 + struct blk_plug plug; 2360 + 2361 + list_sort(NULL, buffer_list, xfs_buf_cmp); 2362 + 2363 + blk_start_plug(&plug); 2364 + list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2365 + if (!xfs_buf_trylock(bp)) 2366 + continue; 2367 + if (xfs_buf_ispinned(bp)) { 2368 + xfs_buf_unlock(bp); 2369 + pinned++; 2370 + continue; 2371 + } 2372 + if (!xfs_buf_delwri_submit_prep(bp)) 2373 + continue; 2374 + bp->b_flags |= XBF_ASYNC; 2375 + xfs_buf_list_del(bp); 2376 + xfs_buf_submit(bp); 2377 + } 2378 + blk_finish_plug(&plug); 2379 + 2380 + return pinned; 2200 2381 } 2201 2382 2202 2383 /* ··· 2236 2371 { 2237 2372 LIST_HEAD (wait_list); 2238 2373 int error = 0, error2; 2239 - struct xfs_buf *bp; 2374 + struct xfs_buf *bp, *n; 2375 + struct blk_plug plug; 2240 2376 2241 - xfs_buf_delwri_submit_buffers(buffer_list, 
&wait_list); 2377 + list_sort(NULL, buffer_list, xfs_buf_cmp); 2378 + 2379 + blk_start_plug(&plug); 2380 + list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2381 + xfs_buf_lock(bp); 2382 + if (!xfs_buf_delwri_submit_prep(bp)) 2383 + continue; 2384 + bp->b_flags &= ~XBF_ASYNC; 2385 + list_move_tail(&bp->b_list, &wait_list); 2386 + xfs_buf_submit(bp); 2387 + } 2388 + blk_finish_plug(&plug); 2242 2389 2243 2390 /* Wait for IO to complete. */ 2244 2391 while (!list_empty(&wait_list)) { ··· 2275 2398 * Push a single buffer on a delwri queue. 2276 2399 * 2277 2400 * The purpose of this function is to submit a single buffer of a delwri queue 2278 - * and return with the buffer still on the original queue. The waiting delwri 2279 - * buffer submission infrastructure guarantees transfer of the delwri queue 2280 - * buffer reference to a temporary wait list. We reuse this infrastructure to 2281 - * transfer the buffer back to the original queue. 2401 + * and return with the buffer still on the original queue. 2282 2402 * 2283 - * Note the buffer transitions from the queued state, to the submitted and wait 2284 - * listed state and back to the queued state during this call. The buffer 2285 - * locking and queue management logic between _delwri_pushbuf() and 2403 + * The buffer locking and queue management logic between _delwri_pushbuf() and 2286 2404 * _delwri_queue() guarantee that the buffer cannot be queued to another list 2287 2405 * before returning. 2288 2406 */ ··· 2286 2414 struct xfs_buf *bp, 2287 2415 struct list_head *buffer_list) 2288 2416 { 2289 - LIST_HEAD (submit_list); 2290 2417 int error; 2291 2418 2292 2419 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 2293 2420 2294 2421 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); 2295 2422 2296 - /* 2297 - * Isolate the buffer to a new local list so we can submit it for I/O 2298 - * independently from the rest of the original list. 
2299 - */ 2300 2423 xfs_buf_lock(bp); 2301 - list_move(&bp->b_list, &submit_list); 2302 - xfs_buf_unlock(bp); 2424 + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); 2425 + bp->b_flags |= XBF_WRITE; 2426 + xfs_buf_submit(bp); 2303 2427 2304 2428 /* 2305 - * Delwri submission clears the DELWRI_Q buffer flag and returns with 2306 - * the buffer on the wait list with the original reference. Rather than 2307 - * bounce the buffer from a local wait list back to the original list 2308 - * after I/O completion, reuse the original list as the wait list. 2309 - */ 2310 - xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); 2311 - 2312 - /* 2313 - * The buffer is now locked, under I/O and wait listed on the original 2314 - * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and 2315 - * return with the buffer unlocked and on the original queue. 2429 + * The buffer is now locked, under I/O but still on the original delwri 2430 + * queue. Wait for I/O completion, restore the DELWRI_Q flag and 2431 + * return with the buffer unlocked and still on the original queue. 2316 2432 */ 2317 2433 error = xfs_buf_iowait(bp); 2318 2434 bp->b_flags |= _XBF_DELWRI_Q;

+3 -8

fs/xfs/xfs_buf.h

··· 34 34 #define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ 35 35 36 36 /* buffer type flags for write callbacks */ 37 - #define _XBF_INODES (1u << 16)/* inode buffer */ 38 - #define _XBF_DQUOTS (1u << 17)/* dquot buffer */ 39 37 #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ 40 38 41 39 /* flags used only internally */ ··· 63 65 { XBF_DONE, "DONE" }, \ 64 66 { XBF_STALE, "STALE" }, \ 65 67 { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ 66 - { _XBF_INODES, "INODES" }, \ 67 - { _XBF_DQUOTS, "DQUOTS" }, \ 68 68 { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ 69 69 { _XBF_PAGES, "PAGES" }, \ 70 70 { _XBF_KMEM, "KMEM" }, \ ··· 168 172 169 173 xfs_daddr_t b_rhash_key; /* buffer cache index */ 170 174 int b_length; /* size of buffer in BBs */ 171 - atomic_t b_hold; /* reference count */ 175 + unsigned int b_hold; /* reference count */ 172 176 atomic_t b_lru_ref; /* lru reclaim ref count */ 173 177 xfs_buf_flags_t b_flags; /* status flags */ 174 178 struct semaphore b_sema; /* semaphore for lockables */ ··· 180 184 struct list_head b_lru; /* lru list */ 181 185 spinlock_t b_lock; /* internal state lock */ 182 186 unsigned int b_state; /* internal state flags */ 183 - int b_io_error; /* internal IO error state */ 184 187 wait_queue_head_t b_waiters; /* unpin waiters */ 185 188 struct list_head b_list; 186 - struct xfs_perag *b_pag; /* contains rbtree root */ 189 + struct xfs_perag *b_pag; 187 190 struct xfs_mount *b_mount; 188 191 struct xfs_buftarg *b_target; /* buffer target (device) */ 189 192 void *b_addr; /* virtual address of buffer */ ··· 197 202 struct xfs_buf_map __b_map; /* inline compound buffer map */ 198 203 int b_map_count; 199 204 atomic_t b_pin_count; /* pin count */ 200 - atomic_t b_io_remaining; /* #outstanding I/O requests */ 201 205 unsigned int b_page_count; /* size of page array */ 202 206 unsigned int b_offset; /* page offset of b_addr, 203 207 only for _XBF_KMEM buffers */ 204 208 int b_error; /* error code on I/O */ 209 + 
void (*b_iodone)(struct xfs_buf *bp); 205 210 206 211 /* 207 212 * async write failure retry count. Initialised to zero on the first

-5

fs/xfs/xfs_buf_item.h

··· 54 54 void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); 55 55 bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); 56 56 void xfs_buf_inode_iodone(struct xfs_buf *); 57 - void xfs_buf_inode_io_fail(struct xfs_buf *bp); 58 57 #ifdef CONFIG_XFS_QUOTA 59 58 void xfs_buf_dquot_iodone(struct xfs_buf *); 60 - void xfs_buf_dquot_io_fail(struct xfs_buf *bp); 61 59 #else 62 60 static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) 63 - { 64 - } 65 - static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) 66 61 { 67 62 } 68 63 #endif /* CONFIG_XFS_QUOTA */

+18 -1

fs/xfs/xfs_buf_item_recover.c

··· 262 262 case XFS_BMAP_MAGIC: 263 263 bp->b_ops = &xfs_bmbt_buf_ops; 264 264 break; 265 + case XFS_RTRMAP_CRC_MAGIC: 266 + bp->b_ops = &xfs_rtrmapbt_buf_ops; 267 + break; 265 268 case XFS_RMAP_CRC_MAGIC: 266 269 bp->b_ops = &xfs_rmapbt_buf_ops; 267 270 break; 268 271 case XFS_REFC_CRC_MAGIC: 269 272 bp->b_ops = &xfs_refcountbt_buf_ops; 273 + break; 274 + case XFS_RTREFC_CRC_MAGIC: 275 + bp->b_ops = &xfs_rtrefcountbt_buf_ops; 270 276 break; 271 277 default: 272 278 warnmsg = "Bad btree block magic!"; ··· 861 855 uuid = &btb->bb_u.s.bb_uuid; 862 856 break; 863 857 } 858 + case XFS_RTRMAP_CRC_MAGIC: 859 + case XFS_RTREFC_CRC_MAGIC: 864 860 case XFS_BMAP_CRC_MAGIC: 865 861 case XFS_BMAP_MAGIC: { 866 862 struct xfs_btree_block *btb = blk; ··· 1087 1079 error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f, 1088 1080 current_lsn); 1089 1081 if (error) 1090 - goto out_release; 1082 + goto out_writebuf; 1091 1083 1092 1084 /* Update the rt superblock if we have one. */ 1093 1085 if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) { ··· 1103 1095 } else { 1104 1096 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); 1105 1097 } 1098 + 1099 + /* 1100 + * Buffer held by buf log item during 'normal' buffer recovery must 1101 + * be committed through buffer I/O submission path to ensure proper 1102 + * release. When error occurs during sb buffer recovery, log shutdown 1103 + * will be done before submitting buffer list so that buffers can be 1104 + * released correctly through ioend failure path. 1105 + */ 1106 + out_writebuf: 1106 1107 1107 1108 /* 1108 1109 * Perform delayed write on the buffer. Asynchronous writes will be

+1 -1

fs/xfs/xfs_discard.c

··· 90 90 91 91 /* 92 92 * Queue up the actual completion to a thread to avoid IRQ-safe locking for 93 - * pagb_lock. 93 + * eb_lock. 94 94 */ 95 95 static void 96 96 xfs_discard_endio(

+9 -17

fs/xfs/xfs_dquot.c

··· 1230 1230 } 1231 1231 } 1232 1232 1233 - void 1234 - xfs_buf_dquot_io_fail( 1235 - struct xfs_buf *bp) 1236 - { 1237 - struct xfs_log_item *lip; 1238 - 1239 - spin_lock(&bp->b_mount->m_ail->ail_lock); 1240 - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) 1241 - set_bit(XFS_LI_FAILED, &lip->li_flags); 1242 - spin_unlock(&bp->b_mount->m_ail->ail_lock); 1243 - } 1244 - 1245 1233 /* Check incore dquot for errors before we flush. */ 1246 1234 static xfs_failaddr_t 1247 1235 xfs_qm_dqflush_check( ··· 1304 1316 1305 1317 /* 1306 1318 * Attach a dquot buffer to this dquot to avoid allocating a buffer during a 1307 - * dqflush, since dqflush can be called from reclaim context. 1319 + * dqflush, since dqflush can be called from reclaim context. Caller must hold 1320 + * the dqlock. 1308 1321 */ 1309 1322 int 1310 1323 xfs_dquot_attach_buf( ··· 1326 1337 return error; 1327 1338 1328 1339 /* 1329 - * Attach the dquot to the buffer so that the AIL does not have 1330 - * to read the dquot buffer to push this item. 1340 + * Hold the dquot buffer so that we retain our ref to it after 1341 + * detaching it from the transaction, then give that ref to the 1342 + * dquot log item so that the AIL does not have to read the 1343 + * dquot buffer to push this item. 1331 1344 */ 1332 1345 xfs_buf_hold(bp); 1346 + xfs_trans_brelse(tp, bp); 1347 + 1333 1348 spin_lock(&qlip->qli_lock); 1334 1349 lip->li_buf = bp; 1335 - xfs_trans_brelse(tp, bp); 1336 1350 } 1337 1351 qlip->qli_dirty = true; 1338 1352 spin_unlock(&qlip->qli_lock); ··· 1451 1459 * Attach the dquot to the buffer so that we can remove this dquot from 1452 1460 * the AIL and release the flush lock once the dquot is synced to disk. 1453 1461 */ 1454 - bp->b_flags |= _XBF_DQUOTS; 1462 + bp->b_iodone = xfs_buf_dquot_iodone; 1455 1463 list_add_tail(&lip->li_bio_list, &bp->b_li_list); 1456 1464 1457 1465 /*

+3

fs/xfs/xfs_dquot.h

··· 160 160 struct xfs_inode *ip, 161 161 xfs_dqtype_t type) 162 162 { 163 + if (xfs_is_metadir_inode(ip)) 164 + return NULL; 165 + 163 166 switch (type) { 164 167 case XFS_DQTYPE_USER: 165 168 return ip->i_udquot;

+10 -10

fs/xfs/xfs_drain.c

··· 13 13 #include "xfs_trace.h" 14 14 15 15 /* 16 - * Use a static key here to reduce the overhead of xfs_drain_rele. If the 17 - * compiler supports jump labels, the static branch will be replaced by a nop 18 - * sled when there are no xfs_drain_wait callers. Online fsck is currently 19 - * the only caller, so this is a reasonable tradeoff. 16 + * Use a static key here to reduce the overhead of xfs_defer_drain_rele. If 17 + * the compiler supports jump labels, the static branch will be replaced by a 18 + * nop sled when there are no xfs_defer_drain_wait callers. Online fsck is 19 + * currently the only caller, so this is a reasonable tradeoff. 20 20 * 21 21 * Note: Patching the kernel code requires taking the cpu hotplug lock. Other 22 22 * parts of the kernel allocate memory with that lock held, which means that 23 23 * XFS callers cannot hold any locks that might be used by memory reclaim or 24 24 * writeback when calling the static_branch_{inc,dec} functions. 25 25 */ 26 - static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate); 26 + static DEFINE_STATIC_KEY_FALSE(xfs_defer_drain_waiter_gate); 27 27 28 28 void 29 - xfs_drain_wait_disable(void) 29 + xfs_defer_drain_wait_disable(void) 30 30 { 31 - static_branch_dec(&xfs_drain_waiter_gate); 31 + static_branch_dec(&xfs_defer_drain_waiter_gate); 32 32 } 33 33 34 34 void 35 - xfs_drain_wait_enable(void) 35 + xfs_defer_drain_wait_enable(void) 36 36 { 37 - static_branch_inc(&xfs_drain_waiter_gate); 37 + static_branch_inc(&xfs_defer_drain_waiter_gate); 38 38 } 39 39 40 40 void ··· 71 71 static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr) 72 72 { 73 73 if (atomic_dec_and_test(&dr->dr_count) && 74 - static_branch_unlikely(&xfs_drain_waiter_gate) && 74 + static_branch_unlikely(&xfs_defer_drain_waiter_gate) && 75 75 has_waiters(&dr->dr_waiters)) 76 76 wake_up(&dr->dr_waiters); 77 77 }

+5 -2

fs/xfs/xfs_drain.h

··· 26 26 void xfs_defer_drain_init(struct xfs_defer_drain *dr); 27 27 void xfs_defer_drain_free(struct xfs_defer_drain *dr); 28 28 29 - void xfs_drain_wait_disable(void); 30 - void xfs_drain_wait_enable(void); 29 + void xfs_defer_drain_wait_disable(void); 30 + void xfs_defer_drain_wait_enable(void); 31 31 32 32 /* 33 33 * Deferred Work Intent Drains ··· 61 61 * All functions that create work items must increment the intent counter as 62 62 * soon as the item is added to the transaction and cannot drop the counter 63 63 * until the item is finished or cancelled. 64 + * 65 + * The same principles apply to realtime groups because the rt metadata inode 66 + * ILOCKs are not held across transaction rolls. 64 67 */ 65 68 struct xfs_group *xfs_group_intent_get(struct xfs_mount *mp, 66 69 xfs_fsblock_t fsbno, enum xfs_group_type type);

+3

fs/xfs/xfs_error.c

··· 63 63 XFS_RANDOM_WB_DELAY_MS, 64 64 XFS_RANDOM_WRITE_DELAY_MS, 65 65 XFS_RANDOM_EXCHMAPS_FINISH_ONE, 66 + XFS_RANDOM_METAFILE_RESV_CRITICAL, 66 67 }; 67 68 68 69 struct xfs_errortag_attr { ··· 182 181 XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); 183 182 XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); 184 183 XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); 184 + XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); 185 185 186 186 static struct attribute *xfs_errortag_attrs[] = { 187 187 XFS_ERRORTAG_ATTR_LIST(noerror), ··· 229 227 XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), 230 228 XFS_ERRORTAG_ATTR_LIST(write_delay_ms), 231 229 XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), 230 + XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), 232 231 NULL, 233 232 }; 234 233 ATTRIBUTE_GROUPS(xfs_errortag);

+3

fs/xfs/xfs_exchrange.c

··· 119 119 int ip1_error = 0; 120 120 int error; 121 121 122 + ASSERT(!xfs_is_metadir_inode(req->ip1)); 123 + ASSERT(!xfs_is_metadir_inode(req->ip2)); 124 + 122 125 /* 123 126 * Don't bother with a quota reservation if we're not enforcing them 124 127 * or the two inodes have the same dquots.

+183 -10

fs/xfs/xfs_fsmap.c

··· 26 26 #include "xfs_rtbitmap.h" 27 27 #include "xfs_ag.h" 28 28 #include "xfs_rtgroup.h" 29 + #include "xfs_rtrmap_btree.h" 30 + #include "xfs_rtrefcount_btree.h" 29 31 30 32 /* Convert an xfs_fsmap to an fsmap. */ 31 33 static void ··· 213 211 struct xfs_mount *mp = tp->t_mountp; 214 212 struct xfs_btree_cur *cur; 215 213 xfs_agblock_t fbno; 216 - xfs_extlen_t flen; 214 + xfs_extlen_t flen = 0; 217 215 int error; 218 216 219 217 *stat = false; 220 - if (!xfs_has_reflink(mp)) 218 + if (!xfs_has_reflink(mp) || !info->group) 221 219 return 0; 222 - /* rt files will have no perag structure */ 223 - if (!info->group) 224 - return 0; 220 + 221 + if (info->group->xg_type == XG_TYPE_RTG) 222 + cur = xfs_rtrefcountbt_init_cursor(tp, to_rtg(info->group)); 223 + else 224 + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, 225 + to_perag(info->group)); 225 226 226 227 /* Are there any shared blocks here? */ 227 - flen = 0; 228 - cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, 229 - to_perag(info->group)); 230 - 231 228 error = xfs_refcount_find_shared(cur, frec->rec_key, 232 229 XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen, 233 230 false); ··· 833 832 834 833 return error; 835 834 } 835 + 836 + /* Transform a realtime rmapbt record into a fsmap */ 837 + STATIC int 838 + xfs_getfsmap_rtdev_rmapbt_helper( 839 + struct xfs_btree_cur *cur, 840 + const struct xfs_rmap_irec *rec, 841 + void *priv) 842 + { 843 + struct xfs_fsmap_irec frec = { 844 + .owner = rec->rm_owner, 845 + .offset = rec->rm_offset, 846 + .rm_flags = rec->rm_flags, 847 + .rec_key = rec->rm_startblock, 848 + }; 849 + struct xfs_getfsmap_info *info = priv; 850 + 851 + return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group, 852 + rec->rm_startblock, rec->rm_blockcount, &frec); 853 + } 854 + 855 + /* Actually query the rtrmap btree. 
*/ 856 + STATIC int 857 + xfs_getfsmap_rtdev_rmapbt_query( 858 + struct xfs_trans *tp, 859 + struct xfs_getfsmap_info *info, 860 + struct xfs_btree_cur **curpp) 861 + { 862 + struct xfs_rtgroup *rtg = to_rtg(info->group); 863 + 864 + /* Query the rtrmapbt */ 865 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); 866 + *curpp = xfs_rtrmapbt_init_cursor(tp, rtg); 867 + return xfs_rmap_query_range(*curpp, &info->low, &info->high, 868 + xfs_getfsmap_rtdev_rmapbt_helper, info); 869 + } 870 + 871 + /* Execute a getfsmap query against the realtime device rmapbt. */ 872 + STATIC int 873 + xfs_getfsmap_rtdev_rmapbt( 874 + struct xfs_trans *tp, 875 + const struct xfs_fsmap *keys, 876 + struct xfs_getfsmap_info *info) 877 + { 878 + struct xfs_mount *mp = tp->t_mountp; 879 + struct xfs_rtgroup *rtg = NULL; 880 + struct xfs_btree_cur *bt_cur = NULL; 881 + xfs_rtblock_t start_rtb; 882 + xfs_rtblock_t end_rtb; 883 + xfs_rgnumber_t start_rg, end_rg; 884 + uint64_t eofs; 885 + int error = 0; 886 + 887 + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 888 + if (keys[0].fmr_physical >= eofs) 889 + return 0; 890 + start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical); 891 + end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); 892 + 893 + info->missing_owner = XFS_FMR_OWN_FREE; 894 + 895 + /* 896 + * Convert the fsmap low/high keys to rtgroup based keys. Initialize 897 + * low to the fsmap low key and max out the high key to the end 898 + * of the rtgroup. 899 + */ 900 + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); 901 + error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); 902 + if (error) 903 + return error; 904 + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); 905 + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); 906 + 907 + /* Adjust the low key if we are continuing from where we left off. 
*/ 908 + if (info->low.rm_blockcount == 0) { 909 + /* No previous record from which to continue */ 910 + } else if (rmap_not_shareable(mp, &info->low)) { 911 + /* Last record seen was an unshareable extent */ 912 + info->low.rm_owner = 0; 913 + info->low.rm_offset = 0; 914 + 915 + start_rtb += info->low.rm_blockcount; 916 + if (xfs_rtb_to_daddr(mp, start_rtb) >= eofs) 917 + return 0; 918 + } else { 919 + /* Last record seen was a shareable file data extent */ 920 + info->low.rm_offset += info->low.rm_blockcount; 921 + } 922 + info->low.rm_startblock = xfs_rtb_to_rgbno(mp, start_rtb); 923 + 924 + info->high.rm_startblock = -1U; 925 + info->high.rm_owner = ULLONG_MAX; 926 + info->high.rm_offset = ULLONG_MAX; 927 + info->high.rm_blockcount = 0; 928 + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; 929 + 930 + start_rg = xfs_rtb_to_rgno(mp, start_rtb); 931 + end_rg = xfs_rtb_to_rgno(mp, end_rtb); 932 + 933 + while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rg, end_rg))) { 934 + /* 935 + * Set the rtgroup high key from the fsmap high key if this 936 + * is the last rtgroup that we're querying. 
937 + */ 938 + info->group = rtg_group(rtg); 939 + if (rtg_rgno(rtg) == end_rg) { 940 + info->high.rm_startblock = 941 + xfs_rtb_to_rgbno(mp, end_rtb); 942 + info->high.rm_offset = 943 + XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); 944 + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); 945 + if (error) 946 + break; 947 + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); 948 + } 949 + 950 + if (bt_cur) { 951 + xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), 952 + XFS_RTGLOCK_RMAP | 953 + XFS_RTGLOCK_REFCOUNT); 954 + xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR); 955 + bt_cur = NULL; 956 + } 957 + 958 + trace_xfs_fsmap_low_group_key(mp, info->dev, rtg_rgno(rtg), 959 + &info->low); 960 + trace_xfs_fsmap_high_group_key(mp, info->dev, rtg_rgno(rtg), 961 + &info->high); 962 + 963 + error = xfs_getfsmap_rtdev_rmapbt_query(tp, info, &bt_cur); 964 + if (error) 965 + break; 966 + 967 + /* 968 + * Set the rtgroup low key to the start of the rtgroup prior to 969 + * moving on to the next rtgroup. 970 + */ 971 + if (rtg_rgno(rtg) == start_rg) 972 + memset(&info->low, 0, sizeof(info->low)); 973 + 974 + /* 975 + * If this is the last rtgroup, report any gap at the end of it 976 + * before we drop the reference to the rtgroup when the loop 977 + * terminates. 978 + */ 979 + if (rtg_rgno(rtg) == end_rg) { 980 + info->last = true; 981 + error = xfs_getfsmap_rtdev_rmapbt_helper(bt_cur, 982 + &info->high, info); 983 + if (error) 984 + break; 985 + } 986 + info->group = NULL; 987 + } 988 + 989 + if (bt_cur) { 990 + xfs_rtgroup_unlock(to_rtg(bt_cur->bc_group), 991 + XFS_RTGLOCK_RMAP | XFS_RTGLOCK_REFCOUNT); 992 + xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR : 993 + XFS_BTREE_NOERROR); 994 + } 995 + 996 + /* loop termination case */ 997 + if (rtg) { 998 + info->group = NULL; 999 + xfs_rtgroup_rele(rtg); 1000 + } 1001 + 1002 + return error; 1003 + } 836 1004 #endif /* CONFIG_XFS_RT */ 837 1005 838 1006 /* Do we recognize the device? 
*/ ··· 1141 971 if (mp->m_rtdev_targp) { 1142 972 handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 1143 973 handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); 1144 - handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; 974 + if (use_rmap) 975 + handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; 976 + else 977 + handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; 1145 978 } 1146 979 #endif /* CONFIG_XFS_RT */ 1147 980

+30

fs/xfs/xfs_fsops.c

··· 21 21 #include "xfs_ag.h" 22 22 #include "xfs_ag_resv.h" 23 23 #include "xfs_trace.h" 24 + #include "xfs_rtalloc.h" 25 + #include "xfs_rtrmap_btree.h" 26 + #include "xfs_rtrefcount_btree.h" 24 27 25 28 /* 26 29 * Write new AG headers to disk. Non-transactional, but need to be ··· 115 112 return error; 116 113 xfs_buf_relse(bp); 117 114 } 115 + 116 + /* Make sure the new fs size won't cause problems with the log. */ 117 + error = xfs_growfs_check_rtgeom(mp, nb, mp->m_sb.sb_rblocks, 118 + mp->m_sb.sb_rextsize); 119 + if (error) 120 + return error; 118 121 119 122 nb_div = nb; 120 123 nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); ··· 229 220 error = xfs_fs_reserve_ag_blocks(mp); 230 221 if (error == -ENOSPC) 231 222 error = 0; 223 + 224 + /* Compute new maxlevels for rt btrees. */ 225 + xfs_rtrmapbt_compute_maxlevels(mp); 226 + xfs_rtrefcountbt_compute_maxlevels(mp); 232 227 } 228 + 233 229 return error; 234 230 235 231 out_trans_cancel: ··· 555 541 xfs_warn(mp, 556 542 "Error %d reserving per-AG metadata reserve pool.", error); 557 543 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 544 + return error; 545 + } 546 + 547 + if (xfs_has_realtime(mp)) { 548 + err2 = xfs_rt_resv_init(mp); 549 + if (err2 && err2 != -ENOSPC) { 550 + xfs_warn(mp, 551 + "Error %d reserving realtime metadata reserve pool.", err2); 552 + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 553 + } 554 + 555 + if (err2 && !error) 556 + error = err2; 558 557 } 559 558 560 559 return error; ··· 581 554 struct xfs_mount *mp) 582 555 { 583 556 struct xfs_perag *pag = NULL; 557 + 558 + if (xfs_has_realtime(mp)) 559 + xfs_rt_resv_free(mp); 584 560 585 561 while ((pag = xfs_perag_next(mp, pag))) 586 562 xfs_ag_resv_free(pag);

+2

fs/xfs/xfs_health.c

··· 447 447 { XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER }, 448 448 { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, 449 449 { XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY }, 450 + { XFS_SICK_RG_RMAPBT, XFS_RTGROUP_GEOM_SICK_RMAPBT }, 451 + { XFS_SICK_RG_REFCNTBT, XFS_RTGROUP_GEOM_SICK_REFCNTBT }, 450 452 }; 451 453 452 454 /* Fill out rtgroup geometry health info. */

+18 -1

fs/xfs/xfs_inode.c

··· 2382 2382 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2383 2383 goto flush_out; 2384 2384 } 2385 - if (S_ISREG(VFS_I(ip)->i_mode)) { 2385 + if (ip->i_df.if_format == XFS_DINODE_FMT_META_BTREE) { 2386 + if (!S_ISREG(VFS_I(ip)->i_mode) || 2387 + !(ip->i_diflags2 & XFS_DIFLAG2_METADATA)) { 2388 + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2389 + "%s: Bad %s meta btree inode %Lu, ptr "PTR_FMT, 2390 + __func__, xfs_metafile_type_str(ip->i_metatype), 2391 + ip->i_ino, ip); 2392 + goto flush_out; 2393 + } 2394 + } else if (S_ISREG(VFS_I(ip)->i_mode)) { 2386 2395 if (XFS_TEST_ERROR( 2387 2396 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 2388 2397 ip->i_df.if_format != XFS_DINODE_FMT_BTREE, ··· 2428 2419 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2429 2420 "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, 2430 2421 __func__, ip->i_ino, ip->i_forkoff, ip); 2422 + goto flush_out; 2423 + } 2424 + 2425 + if (xfs_inode_has_attr_fork(ip) && 2426 + ip->i_af.if_format == XFS_DINODE_FMT_META_BTREE) { 2427 + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2428 + "%s: meta btree in inode %Lu attr fork, ptr "PTR_FMT, 2429 + __func__, ip->i_ino, ip); 2431 2430 goto flush_out; 2432 2431 } 2433 2432

+13 -3

fs/xfs/xfs_inode.h

··· 25 25 typedef struct xfs_inode { 26 26 /* Inode linking and identification information. */ 27 27 struct xfs_mount *i_mount; /* fs mount struct ptr */ 28 - struct xfs_dquot *i_udquot; /* user dquot */ 29 - struct xfs_dquot *i_gdquot; /* group dquot */ 30 - struct xfs_dquot *i_pdquot; /* project dquot */ 28 + union { 29 + struct { 30 + struct xfs_dquot *i_udquot; /* user dquot */ 31 + struct xfs_dquot *i_gdquot; /* group dquot */ 32 + struct xfs_dquot *i_pdquot; /* project dquot */ 33 + }; 34 + 35 + /* 36 + * Space that has been set aside to accommodate expansions of a 37 + * metadata btree rooted in this file. 38 + */ 39 + uint64_t i_meta_resv_asked; 40 + }; 31 41 32 42 /* Inode location stuff */ 33 43 xfs_ino_t i_ino; /* inode number (agno/agino)*/

+17 -13

fs/xfs/xfs_inode_item.c

··· 157 157 if (flags & XFS_ILOG_IVERSION) 158 158 flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); 159 159 160 + /* 161 + * Inode verifiers do not check that the CoW extent size hint is an 162 + * integer multiple of the rt extent size on a directory with both 163 + * rtinherit and cowextsize flags set. If we're logging a directory 164 + * that is misconfigured in this way, clear the hint. 165 + */ 166 + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && 167 + (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && 168 + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { 169 + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; 170 + ip->i_cowextsize = 0; 171 + flags |= XFS_ILOG_CORE; 172 + } 173 + 160 174 if (!iip->ili_item.li_buf) { 161 175 struct xfs_buf *bp; 162 176 int error; ··· 199 185 xfs_buf_hold(bp); 200 186 spin_lock(&iip->ili_lock); 201 187 iip->ili_item.li_buf = bp; 202 - bp->b_flags |= _XBF_INODES; 188 + bp->b_iodone = xfs_buf_inode_iodone; 203 189 list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); 204 190 xfs_trans_brelse(tp, bp); 205 191 } ··· 256 242 } 257 243 break; 258 244 case XFS_DINODE_FMT_BTREE: 245 + case XFS_DINODE_FMT_META_BTREE: 259 246 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 260 247 ip->i_df.if_broot_bytes > 0) { 261 248 *nbytes += ip->i_df.if_broot_bytes; ··· 377 362 } 378 363 break; 379 364 case XFS_DINODE_FMT_BTREE: 365 + case XFS_DINODE_FMT_META_BTREE: 380 366 iip->ili_fields &= 381 367 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV); 382 368 ··· 1037 1021 xfs_iflush_finish(bp, &flushed_inodes); 1038 1022 if (!list_empty(&flushed_inodes)) 1039 1023 list_splice_tail(&flushed_inodes, &bp->b_li_list); 1040 - } 1041 - 1042 - void 1043 - xfs_buf_inode_io_fail( 1044 - struct xfs_buf *bp) 1045 - { 1046 - struct xfs_log_item *lip; 1047 - 1048 - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { 1049 - set_bit(XFS_LI_FAILED, &lip->li_flags); 1050 - clear_bit(XFS_LI_FLUSHING, &lip->li_flags); 1051 - } 1052 1024 } 1053 1025 1054 1026 /*

+43 -5

fs/xfs/xfs_inode_item_recover.c

··· 22 22 #include "xfs_log_recover.h" 23 23 #include "xfs_icache.h" 24 24 #include "xfs_bmap_btree.h" 25 + #include "xfs_rtrmap_btree.h" 26 + #include "xfs_rtrefcount_btree.h" 25 27 26 28 STATIC void 27 29 xlog_recover_inode_ra_pass2( ··· 268 266 return 0; 269 267 } 270 268 269 + static inline int 270 + xlog_recover_inode_dbroot( 271 + struct xfs_mount *mp, 272 + void *src, 273 + unsigned int len, 274 + struct xfs_dinode *dip) 275 + { 276 + void *dfork = XFS_DFORK_DPTR(dip); 277 + unsigned int dsize = XFS_DFORK_DSIZE(dip, mp); 278 + 279 + switch (dip->di_format) { 280 + case XFS_DINODE_FMT_BTREE: 281 + xfs_bmbt_to_bmdr(mp, src, len, dfork, dsize); 282 + break; 283 + case XFS_DINODE_FMT_META_BTREE: 284 + switch (be16_to_cpu(dip->di_metatype)) { 285 + case XFS_METAFILE_RTRMAP: 286 + xfs_rtrmapbt_to_disk(mp, src, len, dfork, dsize); 287 + return 0; 288 + case XFS_METAFILE_RTREFCOUNT: 289 + xfs_rtrefcountbt_to_disk(mp, src, len, dfork, dsize); 290 + return 0; 291 + default: 292 + ASSERT(0); 293 + return -EFSCORRUPTED; 294 + } 295 + break; 296 + default: 297 + ASSERT(0); 298 + return -EFSCORRUPTED; 299 + } 300 + 301 + return 0; 302 + } 303 + 271 304 STATIC int 272 305 xlog_recover_inode_commit_pass2( 273 306 struct xlog *log, ··· 430 393 431 394 432 395 if (unlikely(S_ISREG(ldip->di_mode))) { 433 - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 434 - (ldip->di_format != XFS_DINODE_FMT_BTREE)) { 396 + if (ldip->di_format != XFS_DINODE_FMT_EXTENTS && 397 + ldip->di_format != XFS_DINODE_FMT_BTREE && 398 + ldip->di_format != XFS_DINODE_FMT_META_BTREE) { 435 399 XFS_CORRUPTION_ERROR( 436 400 "Bad log dinode data fork format for regular file", 437 401 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); ··· 513 475 break; 514 476 515 477 case XFS_ILOG_DBROOT: 516 - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 517 - (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), 518 - XFS_DFORK_DSIZE(dip, mp)); 478 + error = xlog_recover_inode_dbroot(mp, src, len, dip); 479 + if (error) 
480 + goto out_release; 519 481 break; 520 482 521 483 default:

+15 -6

fs/xfs/xfs_ioctl.c

··· 469 469 } 470 470 } 471 471 472 - if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) 473 - fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); 472 + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) { 473 + /* 474 + * Don't let a misaligned CoW extent size hint on a directory 475 + * escape to userspace if it won't pass the setattr checks 476 + * later. 477 + */ 478 + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && 479 + ip->i_cowextsize % mp->m_sb.sb_rextsize > 0) { 480 + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; 481 + fa->fsx_cowextsize = 0; 482 + } else { 483 + fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); 484 + } 485 + } 486 + 474 487 fa->fsx_projid = ip->i_projid; 475 488 if (ifp && !xfs_need_iread_extents(ifp)) 476 489 fa->fsx_nextents = xfs_iext_count(ifp); ··· 554 541 if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || 555 542 xfs_extlen_to_rtxmod(mp, ip->i_extsize)) 556 543 return -EINVAL; 557 - 558 - /* Clear reflink if we are actually able to set the rt flag. */ 559 - if (xfs_is_reflink_inode(ip)) 560 - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 561 544 } 562 545 563 546 /* diflags2 only valid for v3 inodes. */

-2

fs/xfs/xfs_log.c

··· 2744 2744 if (!ticket->t_cnt) { 2745 2745 xlog_grant_add_space(&log->l_reserve_head, ticket->t_unit_res); 2746 2746 trace_xfs_log_ticket_regrant_exit(log, ticket); 2747 - 2748 - ticket->t_curr_res = ticket->t_unit_res; 2749 2747 } 2750 2748 2751 2749 xfs_log_ticket_put(ticket);

+4

fs/xfs/xfs_log_recover.c

··· 1820 1820 &xlog_xmd_item_ops, 1821 1821 &xlog_rtefi_item_ops, 1822 1822 &xlog_rtefd_item_ops, 1823 + &xlog_rtrui_item_ops, 1824 + &xlog_rtrud_item_ops, 1825 + &xlog_rtcui_item_ops, 1826 + &xlog_rtcud_item_ops, 1823 1827 }; 1824 1828 1825 1829 static const struct xlog_recover_item_ops *

+14

fs/xfs/xfs_mount.c

··· 37 37 #include "xfs_rtbitmap.h" 38 38 #include "xfs_metafile.h" 39 39 #include "xfs_rtgroup.h" 40 + #include "xfs_rtrmap_btree.h" 41 + #include "xfs_rtrefcount_btree.h" 40 42 #include "scrub/stats.h" 41 43 42 44 static DEFINE_MUTEX(xfs_uuid_table_mutex); ··· 652 650 mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); 653 651 } 654 652 653 + /* Compute maximum possible height for realtime btree types for this fs. */ 654 + static inline void 655 + xfs_rtbtree_compute_maxlevels( 656 + struct xfs_mount *mp) 657 + { 658 + mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels, 659 + mp->m_rtrefc_maxlevels); 660 + } 661 + 655 662 /* 656 663 * This function does the following on an initial mount of a file system: 657 664 * - reads the superblock from disk and init the mount struct ··· 729 718 xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); 730 719 xfs_mount_setup_inode_geom(mp); 731 720 xfs_rmapbt_compute_maxlevels(mp); 721 + xfs_rtrmapbt_compute_maxlevels(mp); 732 722 xfs_refcountbt_compute_maxlevels(mp); 723 + xfs_rtrefcountbt_compute_maxlevels(mp); 733 724 734 725 xfs_agbtree_compute_maxlevels(mp); 726 + xfs_rtbtree_compute_maxlevels(mp); 735 727 736 728 /* 737 729 * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks

+22 -3

fs/xfs/xfs_mount.h

··· 158 158 uint m_bmap_dmnr[2]; /* min bmap btree records */ 159 159 uint m_rmap_mxr[2]; /* max rmap btree records */ 160 160 uint m_rmap_mnr[2]; /* min rmap btree records */ 161 + uint m_rtrmap_mxr[2]; /* max rtrmap btree records */ 162 + uint m_rtrmap_mnr[2]; /* min rtrmap btree records */ 161 163 uint m_refc_mxr[2]; /* max refc btree records */ 162 164 uint m_refc_mnr[2]; /* min refc btree records */ 165 + uint m_rtrefc_mxr[2]; /* max rtrefc btree records */ 166 + uint m_rtrefc_mnr[2]; /* min rtrefc btree records */ 163 167 uint m_alloc_maxlevels; /* max alloc btree levels */ 164 168 uint m_bm_maxlevels[2]; /* max bmap btree levels */ 165 169 uint m_rmap_maxlevels; /* max rmap btree levels */ 170 + uint m_rtrmap_maxlevels; /* max rtrmap btree level */ 166 171 uint m_refc_maxlevels; /* max refcount btree level */ 172 + uint m_rtrefc_maxlevels; /* max rtrefc btree level */ 167 173 unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ 174 + unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */ 168 175 xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ 169 176 uint m_alloc_set_aside; /* space we can't use */ 170 177 uint m_ag_max_usable; /* max space per AG */ ··· 357 350 #define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */ 358 351 359 352 #define __XFS_HAS_FEAT(name, NAME) \ 360 - static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ 353 + static inline bool xfs_has_ ## name (const struct xfs_mount *mp) \ 361 354 { \ 362 355 return mp->m_features & XFS_FEAT_ ## NAME; \ 363 356 } ··· 393 386 __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) 394 387 __XFS_HAS_FEAT(metadir, METADIR) 395 388 396 - static inline bool xfs_has_rtgroups(struct xfs_mount *mp) 389 + static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) 397 390 { 398 391 /* all metadir file systems also allow rtgroups */ 399 392 return xfs_has_metadir(mp); 400 393 } 401 394 402 - static inline bool xfs_has_rtsb(struct xfs_mount *mp) 395 + static 
inline bool xfs_has_rtsb(const struct xfs_mount *mp) 403 396 { 404 397 /* all rtgroups filesystems with an rt section have an rtsb */ 405 398 return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); 399 + } 400 + 401 + static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp) 402 + { 403 + return xfs_has_rtgroups(mp) && xfs_has_realtime(mp) && 404 + xfs_has_rmapbt(mp); 405 + } 406 + 407 + static inline bool xfs_has_rtreflink(const struct xfs_mount *mp) 408 + { 409 + return xfs_has_metadir(mp) && xfs_has_realtime(mp) && 410 + xfs_has_reflink(mp); 406 411 } 407 412 408 413 /*

+145 -85

fs/xfs/xfs_notify_failure.c

··· 19 19 #include "xfs_rtalloc.h" 20 20 #include "xfs_trans.h" 21 21 #include "xfs_ag.h" 22 + #include "xfs_notify_failure.h" 23 + #include "xfs_rtgroup.h" 24 + #include "xfs_rtrmap_btree.h" 22 25 23 26 #include <linux/mm.h> 24 27 #include <linux/dax.h> ··· 157 154 } 158 155 159 156 static int 160 - xfs_dax_notify_ddev_failure( 157 + xfs_dax_translate_range( 158 + struct xfs_buftarg *btp, 159 + u64 offset, 160 + u64 len, 161 + xfs_daddr_t *daddr, 162 + uint64_t *bblen) 163 + { 164 + u64 dev_start = btp->bt_dax_part_off; 165 + u64 dev_len = bdev_nr_bytes(btp->bt_bdev); 166 + u64 dev_end = dev_start + dev_len - 1; 167 + 168 + /* Notify failure on the whole device. */ 169 + if (offset == 0 && len == U64_MAX) { 170 + offset = dev_start; 171 + len = dev_len; 172 + } 173 + 174 + /* Ignore the range out of filesystem area */ 175 + if (offset + len - 1 < dev_start) 176 + return -ENXIO; 177 + if (offset > dev_end) 178 + return -ENXIO; 179 + 180 + /* Calculate the real range when it touches the boundary */ 181 + if (offset > dev_start) 182 + offset -= dev_start; 183 + else { 184 + len -= dev_start - offset; 185 + offset = 0; 186 + } 187 + if (offset + len - 1 > dev_end) 188 + len = dev_end - offset + 1; 189 + 190 + *daddr = BTOBB(offset); 191 + *bblen = BTOBB(len); 192 + return 0; 193 + } 194 + 195 + static int 196 + xfs_dax_notify_logdev_failure( 161 197 struct xfs_mount *mp, 162 - xfs_daddr_t daddr, 163 - xfs_daddr_t bblen, 198 + u64 offset, 199 + u64 len, 164 200 int mf_flags) 201 + { 202 + xfs_daddr_t daddr; 203 + uint64_t bblen; 204 + int error; 205 + 206 + /* 207 + * Return ENXIO instead of shutting down the filesystem if the failed 208 + * region is beyond the end of the log. 209 + */ 210 + error = xfs_dax_translate_range(mp->m_logdev_targp, 211 + offset, len, &daddr, &bblen); 212 + if (error) 213 + return error; 214 + 215 + /* 216 + * In the pre-remove case the failure notification is attempting to 217 + * trigger a force unmount. 
The expectation is that the device is 218 + * still present, but its removal is in progress and can not be 219 + * cancelled, proceed with accessing the log device. 220 + */ 221 + if (mf_flags & MF_MEM_PRE_REMOVE) 222 + return 0; 223 + 224 + xfs_err(mp, "ondisk log corrupt, shutting down fs!"); 225 + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); 226 + return -EFSCORRUPTED; 227 + } 228 + 229 + static int 230 + xfs_dax_notify_dev_failure( 231 + struct xfs_mount *mp, 232 + u64 offset, 233 + u64 len, 234 + int mf_flags, 235 + enum xfs_group_type type) 165 236 { 166 237 struct xfs_failure_info notify = { .mf_flags = mf_flags }; 167 238 struct xfs_trans *tp = NULL; 168 239 struct xfs_btree_cur *cur = NULL; 169 - struct xfs_buf *agf_bp = NULL; 170 240 int error = 0; 171 241 bool kernel_frozen = false; 172 - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); 173 - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); 174 - xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, 175 - daddr + bblen - 1); 176 - xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); 242 + uint32_t start_gno, end_gno; 243 + xfs_fsblock_t start_bno, end_bno; 244 + xfs_daddr_t daddr; 245 + uint64_t bblen; 246 + struct xfs_group *xg = NULL; 247 + 248 + if (!xfs_has_rmapbt(mp)) { 249 + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); 250 + return -EOPNOTSUPP; 251 + } 252 + 253 + error = xfs_dax_translate_range(type == XG_TYPE_RTG ? 
254 + mp->m_rtdev_targp : mp->m_ddev_targp, 255 + offset, len, &daddr, &bblen); 256 + if (error) 257 + return error; 258 + 259 + if (type == XG_TYPE_RTG) { 260 + start_bno = xfs_daddr_to_rtb(mp, daddr); 261 + end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1); 262 + } else { 263 + start_bno = XFS_DADDR_TO_FSB(mp, daddr); 264 + end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); 265 + } 177 266 178 267 if (mf_flags & MF_MEM_PRE_REMOVE) { 179 268 xfs_info(mp, "Device is about to be removed!"); ··· 284 189 if (error) 285 190 goto out; 286 191 287 - for (; agno <= end_agno; agno++) { 192 + start_gno = xfs_fsb_to_gno(mp, start_bno, type); 193 + end_gno = xfs_fsb_to_gno(mp, end_bno, type); 194 + while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) { 195 + struct xfs_buf *agf_bp = NULL; 196 + struct xfs_rtgroup *rtg = NULL; 288 197 struct xfs_rmap_irec ri_low = { }; 289 198 struct xfs_rmap_irec ri_high; 290 - struct xfs_agf *agf; 291 - struct xfs_perag *pag; 292 - xfs_agblock_t range_agend; 293 199 294 - pag = xfs_perag_get(mp, agno); 295 - error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); 296 - if (error) { 297 - xfs_perag_put(pag); 298 - break; 200 + if (type == XG_TYPE_AG) { 201 + struct xfs_perag *pag = to_perag(xg); 202 + 203 + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); 204 + if (error) { 205 + xfs_perag_put(pag); 206 + break; 207 + } 208 + 209 + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); 210 + } else { 211 + rtg = to_rtg(xg); 212 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 213 + cur = xfs_rtrmapbt_init_cursor(tp, rtg); 299 214 } 300 - 301 - cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); 302 215 303 216 /* 304 217 * Set the rmap range from ri_low to ri_high, which represents 305 218 * a [start, end] where we looking for the files or metadata. 
306 219 */ 307 220 memset(&ri_high, 0xFF, sizeof(ri_high)); 308 - ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); 309 - if (agno == end_agno) 310 - ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); 221 + if (xg->xg_gno == start_gno) 222 + ri_low.rm_startblock = 223 + xfs_fsb_to_gbno(mp, start_bno, type); 224 + if (xg->xg_gno == end_gno) 225 + ri_high.rm_startblock = 226 + xfs_fsb_to_gbno(mp, end_bno, type); 311 227 312 - agf = agf_bp->b_addr; 313 - range_agend = min(be32_to_cpu(agf->agf_length) - 1, 314 - ri_high.rm_startblock); 315 228 notify.startblock = ri_low.rm_startblock; 316 - notify.blockcount = range_agend + 1 - ri_low.rm_startblock; 229 + notify.blockcount = min(xg->xg_block_count, 230 + ri_high.rm_startblock + 1) - 231 + ri_low.rm_startblock; 317 232 318 233 error = xfs_rmap_query_range(cur, &ri_low, &ri_high, 319 234 xfs_dax_failure_fn, &notify); 320 235 xfs_btree_del_cursor(cur, error); 321 - xfs_trans_brelse(tp, agf_bp); 322 - xfs_perag_put(pag); 323 - if (error) 236 + if (agf_bp) 237 + xfs_trans_brelse(tp, agf_bp); 238 + if (rtg) 239 + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 240 + if (error) { 241 + xfs_group_put(xg); 324 242 break; 325 - 326 - fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); 243 + } 327 244 } 328 245 329 246 xfs_trans_cancel(tp); ··· 370 263 int mf_flags) 371 264 { 372 265 struct xfs_mount *mp = dax_holder(dax_dev); 373 - u64 ddev_start; 374 - u64 ddev_end; 375 266 376 267 if (!(mp->m_super->s_flags & SB_BORN)) { 377 268 xfs_warn(mp, "filesystem is not ready for notify_failure()!"); 378 269 return -EIO; 379 270 } 380 271 381 - if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { 382 - xfs_debug(mp, 383 - "notify_failure() not supported on realtime device!"); 384 - return -EOPNOTSUPP; 272 + if (mp->m_logdev_targp != mp->m_ddev_targp && 273 + mp->m_logdev_targp->bt_daxdev == dax_dev) { 274 + return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags); 385 275 } 386 276 387 - if (mp->m_logdev_targp && 
mp->m_logdev_targp->bt_daxdev == dax_dev && 388 - mp->m_logdev_targp != mp->m_ddev_targp) { 389 - /* 390 - * In the pre-remove case the failure notification is attempting 391 - * to trigger a force unmount. The expectation is that the 392 - * device is still present, but its removal is in progress and 393 - * can not be cancelled, proceed with accessing the log device. 394 - */ 395 - if (mf_flags & MF_MEM_PRE_REMOVE) 396 - return 0; 397 - xfs_err(mp, "ondisk log corrupt, shutting down fs!"); 398 - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); 399 - return -EFSCORRUPTED; 400 - } 401 - 402 - if (!xfs_has_rmapbt(mp)) { 403 - xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); 404 - return -EOPNOTSUPP; 405 - } 406 - 407 - ddev_start = mp->m_ddev_targp->bt_dax_part_off; 408 - ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; 409 - 410 - /* Notify failure on the whole device. */ 411 - if (offset == 0 && len == U64_MAX) { 412 - offset = ddev_start; 413 - len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); 414 - } 415 - 416 - /* Ignore the range out of filesystem area */ 417 - if (offset + len - 1 < ddev_start) 418 - return -ENXIO; 419 - if (offset > ddev_end) 420 - return -ENXIO; 421 - 422 - /* Calculate the real range when it touches the boundary */ 423 - if (offset > ddev_start) 424 - offset -= ddev_start; 425 - else { 426 - len -= ddev_start - offset; 427 - offset = 0; 428 - } 429 - if (offset + len - 1 > ddev_end) 430 - len = ddev_end - offset + 1; 431 - 432 - return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), 433 - mf_flags); 277 + return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags, 278 + (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ? 279 + XG_TYPE_RTG : XG_TYPE_AG); 434 280 } 435 281 436 282 const struct dax_holder_operations xfs_dax_holder_operations = {

+11

fs/xfs/xfs_notify_failure.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Copyright (C) 2024 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef __XFS_NOTIFY_FAILURE_H__ 7 + #define __XFS_NOTIFY_FAILURE_H__ 8 + 9 + extern const struct dax_holder_operations xfs_dax_holder_operations; 10 + 11 + #endif /* __XFS_NOTIFY_FAILURE_H__ */

+6 -4

fs/xfs/xfs_qm.c

··· 230 230 231 231 if (!rtg) 232 232 return; 233 - if (rtg->rtg_inodes[XFS_RTGI_BITMAP]) 234 - xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_BITMAP]); 235 - if (rtg->rtg_inodes[XFS_RTGI_SUMMARY]) 236 - xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_SUMMARY]); 233 + if (rtg_bitmap(rtg)) 234 + xfs_qm_dqdetach(rtg_bitmap(rtg)); 235 + if (rtg_summary(rtg)) 236 + xfs_qm_dqdetach(rtg_summary(rtg)); 237 237 xfs_rtgroup_rele(rtg); 238 238 } 239 239 ··· 428 428 xfs_qm_dqdetach( 429 429 xfs_inode_t *ip) 430 430 { 431 + if (xfs_is_metadir_inode(ip)) 432 + return; 431 433 if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) 432 434 return; 433 435

+16 -10

fs/xfs/xfs_qm_bhv.c

··· 32 32 limit = blkres->softlimit ? 33 33 blkres->softlimit : 34 34 blkres->hardlimit; 35 - if (limit && statp->f_blocks > limit) { 36 - statp->f_blocks = limit; 37 - statp->f_bfree = statp->f_bavail = 38 - (statp->f_blocks > blkres->reserved) ? 39 - (statp->f_blocks - blkres->reserved) : 0; 35 + if (limit) { 36 + uint64_t remaining = 0; 37 + 38 + if (limit > blkres->reserved) 39 + remaining = limit - blkres->reserved; 40 + 41 + statp->f_blocks = min(statp->f_blocks, limit); 42 + statp->f_bfree = min(statp->f_bfree, remaining); 40 43 } 41 44 42 45 limit = dqp->q_ino.softlimit ? 43 46 dqp->q_ino.softlimit : 44 47 dqp->q_ino.hardlimit; 45 - if (limit && statp->f_files > limit) { 46 - statp->f_files = limit; 47 - statp->f_ffree = 48 - (statp->f_files > dqp->q_ino.reserved) ? 49 - (statp->f_files - dqp->q_ino.reserved) : 0; 48 + if (limit) { 49 + uint64_t remaining = 0; 50 + 51 + if (limit > dqp->q_ino.reserved) 52 + remaining = limit - dqp->q_ino.reserved; 53 + 54 + statp->f_files = min(statp->f_files, limit); 55 + statp->f_ffree = min(statp->f_ffree, remaining); 50 56 } 51 57 } 52 58

-5

fs/xfs/xfs_quota.h

··· 29 29 (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \ 30 30 (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL)) 31 31 32 - #define XFS_IS_DQDETACHED(ip) \ 33 - ((ip)->i_udquot == NULL && \ 34 - (ip)->i_gdquot == NULL && \ 35 - (ip)->i_pdquot == NULL) 36 - 37 32 #define XFS_QM_NEED_QUOTACHECK(mp) \ 38 33 ((XFS_IS_UQUOTA_ON(mp) && \ 39 34 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \

+229 -25

fs/xfs/xfs_refcount_item.c

··· 23 23 #include "xfs_ag.h" 24 24 #include "xfs_btree.h" 25 25 #include "xfs_trace.h" 26 + #include "xfs_rtgroup.h" 26 27 27 28 struct kmem_cache *xfs_cui_cache; 28 29 struct kmem_cache *xfs_cud_cache; ··· 95 94 96 95 ASSERT(atomic_read(&cuip->cui_next_extent) == 97 96 cuip->cui_format.cui_nextents); 97 + ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT); 98 98 99 - cuip->cui_format.cui_type = XFS_LI_CUI; 99 + cuip->cui_format.cui_type = lip->li_type; 100 100 cuip->cui_format.cui_size = 1; 101 101 102 102 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, ··· 140 138 STATIC struct xfs_cui_log_item * 141 139 xfs_cui_init( 142 140 struct xfs_mount *mp, 141 + unsigned short item_type, 143 142 uint nextents) 144 - 145 143 { 146 144 struct xfs_cui_log_item *cuip; 147 145 148 146 ASSERT(nextents > 0); 147 + ASSERT(item_type == XFS_LI_CUI || item_type == XFS_LI_CUI_RT); 148 + 149 149 if (nextents > XFS_CUI_MAX_FAST_EXTENTS) 150 150 cuip = kzalloc(xfs_cui_log_item_sizeof(nextents), 151 151 GFP_KERNEL | __GFP_NOFAIL); ··· 155 151 cuip = kmem_cache_zalloc(xfs_cui_cache, 156 152 GFP_KERNEL | __GFP_NOFAIL); 157 153 158 - xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); 154 + xfs_log_item_init(mp, &cuip->cui_item, item_type, &xfs_cui_item_ops); 159 155 cuip->cui_format.cui_nextents = nextents; 160 156 cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; 161 157 atomic_set(&cuip->cui_next_extent, 0); ··· 194 190 struct xfs_cud_log_item *cudp = CUD_ITEM(lip); 195 191 struct xfs_log_iovec *vecp = NULL; 196 192 197 - cudp->cud_format.cud_type = XFS_LI_CUD; 193 + ASSERT(lip->li_type == XFS_LI_CUD || lip->li_type == XFS_LI_CUD_RT); 194 + 195 + cudp->cud_format.cud_type = lip->li_type; 198 196 cudp->cud_format.cud_size = 1; 199 197 200 198 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, ··· 238 232 static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e) 239 233 { 240 234 return 
list_entry(e, struct xfs_refcount_intent, ri_list); 235 + } 236 + 237 + static inline bool 238 + xfs_cui_item_isrt(const struct xfs_log_item *lip) 239 + { 240 + ASSERT(lip->li_type == XFS_LI_CUI || lip->li_type == XFS_LI_CUI_RT); 241 + 242 + return lip->li_type == XFS_LI_CUI_RT; 241 243 } 242 244 243 245 /* Sort refcount intents by AG. */ ··· 296 282 } 297 283 298 284 static struct xfs_log_item * 285 + __xfs_refcount_update_create_intent( 286 + struct xfs_trans *tp, 287 + struct list_head *items, 288 + unsigned int count, 289 + bool sort, 290 + unsigned short item_type) 291 + { 292 + struct xfs_mount *mp = tp->t_mountp; 293 + struct xfs_cui_log_item *cuip; 294 + struct xfs_refcount_intent *ri; 295 + 296 + ASSERT(count > 0); 297 + 298 + cuip = xfs_cui_init(mp, item_type, count); 299 + if (sort) 300 + list_sort(mp, items, xfs_refcount_update_diff_items); 301 + list_for_each_entry(ri, items, ri_list) 302 + xfs_refcount_update_log_item(tp, cuip, ri); 303 + return &cuip->cui_item; 304 + } 305 + 306 + static struct xfs_log_item * 299 307 xfs_refcount_update_create_intent( 300 308 struct xfs_trans *tp, 301 309 struct list_head *items, 302 310 unsigned int count, 303 311 bool sort) 304 312 { 305 - struct xfs_mount *mp = tp->t_mountp; 306 - struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); 307 - struct xfs_refcount_intent *ri; 313 + return __xfs_refcount_update_create_intent(tp, items, count, sort, 314 + XFS_LI_CUI); 315 + } 308 316 309 - ASSERT(count > 0); 310 - 311 - if (sort) 312 - list_sort(mp, items, xfs_refcount_update_diff_items); 313 - list_for_each_entry(ri, items, ri_list) 314 - xfs_refcount_update_log_item(tp, cuip, ri); 315 - return &cuip->cui_item; 317 + static inline unsigned short 318 + xfs_cud_type_from_cui(const struct xfs_cui_log_item *cuip) 319 + { 320 + return xfs_cui_item_isrt(&cuip->cui_item) ? XFS_LI_CUD_RT : XFS_LI_CUD; 316 321 } 317 322 318 323 /* Get an CUD so we can process all the deferred refcount updates. 
*/ ··· 345 312 struct xfs_cud_log_item *cudp; 346 313 347 314 cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); 348 - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, 349 - &xfs_cud_item_ops); 315 + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, 316 + xfs_cud_type_from_cui(cuip), &xfs_cud_item_ops); 350 317 cudp->cud_cuip = cuip; 351 318 cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; 352 319 ··· 361 328 { 362 329 struct xfs_mount *mp = tp->t_mountp; 363 330 364 - trace_xfs_refcount_defer(mp, ri); 331 + /* 332 + * Deferred refcount updates for the realtime and data sections must 333 + * use separate transactions to finish deferred work because updates to 334 + * realtime metadata files can lock AGFs to allocate btree blocks and 335 + * we don't want that mixing with the AGF locks taken to finish data 336 + * section updates. 337 + */ 338 + ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, 339 + ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG); 365 340 366 - ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, XG_TYPE_AG); 367 - xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); 341 + trace_xfs_refcount_defer(mp, ri); 342 + xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ? 343 + &xfs_rtrefcount_update_defer_type : 344 + &xfs_refcount_update_defer_type); 368 345 } 369 346 370 347 /* Cancel a deferred refcount update. 
*/ ··· 424 381 return; 425 382 agbp = rcur->bc_ag.agbp; 426 383 xfs_btree_del_cursor(rcur, error); 427 - if (error) 384 + if (error && agbp) 428 385 xfs_trans_brelse(tp, agbp); 429 386 } 430 387 ··· 440 397 static inline bool 441 398 xfs_cui_validate_phys( 442 399 struct xfs_mount *mp, 400 + bool isrt, 443 401 struct xfs_phys_extent *pmap) 444 402 { 445 403 if (!xfs_has_reflink(mp)) ··· 459 415 return false; 460 416 } 461 417 418 + if (isrt) 419 + return xfs_verify_rtbext(mp, pmap->pe_startblock, pmap->pe_len); 420 + 462 421 return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); 463 422 } 464 423 ··· 469 422 xfs_cui_recover_work( 470 423 struct xfs_mount *mp, 471 424 struct xfs_defer_pending *dfp, 425 + bool isrt, 472 426 struct xfs_phys_extent *pmap) 473 427 { 474 428 struct xfs_refcount_intent *ri; ··· 480 432 ri->ri_startblock = pmap->pe_startblock; 481 433 ri->ri_blockcount = pmap->pe_len; 482 434 ri->ri_group = xfs_group_intent_get(mp, pmap->pe_startblock, 483 - XG_TYPE_AG); 435 + isrt ? XG_TYPE_RTG : XG_TYPE_AG); 436 + ri->ri_realtime = isrt; 484 437 485 438 xfs_defer_add_item(dfp, &ri->ri_list); 486 439 } ··· 500 451 struct xfs_cui_log_item *cuip = CUI_ITEM(lip); 501 452 struct xfs_trans *tp; 502 453 struct xfs_mount *mp = lip->li_log->l_mp; 454 + bool isrt = xfs_cui_item_isrt(lip); 503 455 int i; 504 456 int error = 0; 505 457 ··· 510 460 * just toss the CUI. 
511 461 */ 512 462 for (i = 0; i < cuip->cui_format.cui_nextents; i++) { 513 - if (!xfs_cui_validate_phys(mp, 463 + if (!xfs_cui_validate_phys(mp, isrt, 514 464 &cuip->cui_format.cui_extents[i])) { 515 465 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 516 466 &cuip->cui_format, ··· 518 468 return -EFSCORRUPTED; 519 469 } 520 470 521 - xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); 471 + xfs_cui_recover_work(mp, dfp, isrt, 472 + &cuip->cui_format.cui_extents[i]); 522 473 } 523 474 524 475 /* ··· 566 515 struct xfs_phys_extent *pmap; 567 516 unsigned int count; 568 517 518 + ASSERT(intent->li_type == XFS_LI_CUI || 519 + intent->li_type == XFS_LI_CUI_RT); 520 + 569 521 count = CUI_ITEM(intent)->cui_format.cui_nextents; 570 522 pmap = CUI_ITEM(intent)->cui_format.cui_extents; 571 523 572 - cuip = xfs_cui_init(tp->t_mountp, count); 524 + cuip = xfs_cui_init(tp->t_mountp, intent->li_type, count); 573 525 memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); 574 526 atomic_set(&cuip->cui_next_extent, count); 575 527 ··· 591 537 .recover_work = xfs_refcount_recover_work, 592 538 .relog_intent = xfs_refcount_relog_intent, 593 539 }; 540 + 541 + #ifdef CONFIG_XFS_RT 542 + static struct xfs_log_item * 543 + xfs_rtrefcount_update_create_intent( 544 + struct xfs_trans *tp, 545 + struct list_head *items, 546 + unsigned int count, 547 + bool sort) 548 + { 549 + return __xfs_refcount_update_create_intent(tp, items, count, sort, 550 + XFS_LI_CUI_RT); 551 + } 552 + 553 + /* Process a deferred realtime refcount update. */ 554 + STATIC int 555 + xfs_rtrefcount_update_finish_item( 556 + struct xfs_trans *tp, 557 + struct xfs_log_item *done, 558 + struct list_head *item, 559 + struct xfs_btree_cur **state) 560 + { 561 + struct xfs_refcount_intent *ri = ci_entry(item); 562 + int error; 563 + 564 + error = xfs_rtrefcount_finish_one(tp, ri, state); 565 + 566 + /* Did we run out of reservation? Requeue what we didn't finish. 
*/ 567 + if (!error && ri->ri_blockcount > 0) { 568 + ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || 569 + ri->ri_type == XFS_REFCOUNT_DECREASE); 570 + return -EAGAIN; 571 + } 572 + 573 + xfs_refcount_update_cancel_item(item); 574 + return error; 575 + } 576 + 577 + /* Clean up after calling xfs_rtrefcount_finish_one. */ 578 + STATIC void 579 + xfs_rtrefcount_finish_one_cleanup( 580 + struct xfs_trans *tp, 581 + struct xfs_btree_cur *rcur, 582 + int error) 583 + { 584 + if (rcur) 585 + xfs_btree_del_cursor(rcur, error); 586 + } 587 + 588 + const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = { 589 + .name = "rtrefcount", 590 + .max_items = XFS_CUI_MAX_FAST_EXTENTS, 591 + .create_intent = xfs_rtrefcount_update_create_intent, 592 + .abort_intent = xfs_refcount_update_abort_intent, 593 + .create_done = xfs_refcount_update_create_done, 594 + .finish_item = xfs_rtrefcount_update_finish_item, 595 + .finish_cleanup = xfs_rtrefcount_finish_one_cleanup, 596 + .cancel_item = xfs_refcount_update_cancel_item, 597 + .recover_work = xfs_refcount_recover_work, 598 + .relog_intent = xfs_refcount_relog_intent, 599 + }; 600 + #else 601 + const struct xfs_defer_op_type xfs_rtrefcount_update_defer_type = { 602 + .name = "rtrefcount", 603 + }; 604 + #endif /* CONFIG_XFS_RT */ 594 605 595 606 STATIC bool 596 607 xfs_cui_item_match( ··· 722 603 return -EFSCORRUPTED; 723 604 } 724 605 725 - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); 606 + cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents); 726 607 xfs_cui_copy_format(&cuip->cui_format, cui_formatp); 727 608 atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); 728 609 ··· 734 615 const struct xlog_recover_item_ops xlog_cui_item_ops = { 735 616 .item_type = XFS_LI_CUI, 736 617 .commit_pass2 = xlog_recover_cui_commit_pass2, 618 + }; 619 + 620 + #ifdef CONFIG_XFS_RT 621 + STATIC int 622 + xlog_recover_rtcui_commit_pass2( 623 + struct xlog *log, 624 + struct list_head *buffer_list, 625 + struct 
xlog_recover_item *item, 626 + xfs_lsn_t lsn) 627 + { 628 + struct xfs_mount *mp = log->l_mp; 629 + struct xfs_cui_log_item *cuip; 630 + struct xfs_cui_log_format *cui_formatp; 631 + size_t len; 632 + 633 + cui_formatp = item->ri_buf[0].i_addr; 634 + 635 + if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { 636 + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 637 + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); 638 + return -EFSCORRUPTED; 639 + } 640 + 641 + len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); 642 + if (item->ri_buf[0].i_len != len) { 643 + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 644 + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); 645 + return -EFSCORRUPTED; 646 + } 647 + 648 + cuip = xfs_cui_init(mp, ITEM_TYPE(item), cui_formatp->cui_nextents); 649 + xfs_cui_copy_format(&cuip->cui_format, cui_formatp); 650 + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); 651 + 652 + xlog_recover_intent_item(log, &cuip->cui_item, lsn, 653 + &xfs_rtrefcount_update_defer_type); 654 + return 0; 655 + } 656 + #else 657 + STATIC int 658 + xlog_recover_rtcui_commit_pass2( 659 + struct xlog *log, 660 + struct list_head *buffer_list, 661 + struct xlog_recover_item *item, 662 + xfs_lsn_t lsn) 663 + { 664 + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, 665 + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); 666 + return -EFSCORRUPTED; 667 + } 668 + #endif 669 + 670 + const struct xlog_recover_item_ops xlog_rtcui_item_ops = { 671 + .item_type = XFS_LI_CUI_RT, 672 + .commit_pass2 = xlog_recover_rtcui_commit_pass2, 737 673 }; 738 674 739 675 /* ··· 821 647 const struct xlog_recover_item_ops xlog_cud_item_ops = { 822 648 .item_type = XFS_LI_CUD, 823 649 .commit_pass2 = xlog_recover_cud_commit_pass2, 650 + }; 651 + 652 + #ifdef CONFIG_XFS_RT 653 + STATIC int 654 + xlog_recover_rtcud_commit_pass2( 655 + struct xlog *log, 656 + struct list_head *buffer_list, 657 + struct xlog_recover_item *item, 658 + xfs_lsn_t lsn) 
659 + { 660 + struct xfs_cud_log_format *cud_formatp; 661 + 662 + cud_formatp = item->ri_buf[0].i_addr; 663 + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { 664 + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, 665 + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); 666 + return -EFSCORRUPTED; 667 + } 668 + 669 + xlog_recover_release_intent(log, XFS_LI_CUI_RT, 670 + cud_formatp->cud_cui_id); 671 + return 0; 672 + } 673 + #else 674 + # define xlog_recover_rtcud_commit_pass2 xlog_recover_rtcui_commit_pass2 675 + #endif 676 + 677 + const struct xlog_recover_item_ops xlog_rtcud_item_ops = { 678 + .item_type = XFS_LI_CUD_RT, 679 + .commit_pass2 = xlog_recover_rtcud_commit_pass2, 824 680 };

+236 -89

fs/xfs/xfs_reflink.c

··· 30 30 #include "xfs_ag.h" 31 31 #include "xfs_ag_resv.h" 32 32 #include "xfs_health.h" 33 + #include "xfs_rtrefcount_btree.h" 34 + #include "xfs_rtalloc.h" 35 + #include "xfs_rtgroup.h" 36 + #include "xfs_metafile.h" 33 37 34 38 /* 35 39 * Copy on Write of Shared Blocks ··· 124 120 */ 125 121 126 122 /* 127 - * Given an AG extent, find the lowest-numbered run of shared blocks 128 - * within that range and return the range in fbno/flen. If 129 - * find_end_of_shared is true, return the longest contiguous extent of 130 - * shared blocks. If there are no shared extents, fbno and flen will 131 - * be set to NULLAGBLOCK and 0, respectively. 123 + * Given a file mapping for the data device, find the lowest-numbered run of 124 + * shared blocks within that mapping and return it in shared_offset/shared_len. 125 + * The offset is relative to the start of irec. 126 + * 127 + * If find_end_of_shared is true, return the longest contiguous extent of shared 128 + * blocks. If there are no shared extents, shared_offset and shared_len will be 129 + * set to 0; 132 130 */ 133 131 static int 134 132 xfs_reflink_find_shared( 135 - struct xfs_perag *pag, 133 + struct xfs_mount *mp, 136 134 struct xfs_trans *tp, 137 - xfs_agblock_t agbno, 138 - xfs_extlen_t aglen, 139 - xfs_agblock_t *fbno, 140 - xfs_extlen_t *flen, 135 + const struct xfs_bmbt_irec *irec, 136 + xfs_extlen_t *shared_offset, 137 + xfs_extlen_t *shared_len, 141 138 bool find_end_of_shared) 142 139 { 143 140 struct xfs_buf *agbp; 141 + struct xfs_perag *pag; 144 142 struct xfs_btree_cur *cur; 145 143 int error; 144 + xfs_agblock_t orig_bno, found_bno; 145 + 146 + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); 147 + orig_bno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); 146 148 147 149 error = xfs_alloc_read_agf(pag, tp, 0, &agbp); 148 150 if (error) 149 - return error; 151 + goto out; 150 152 151 - cur = xfs_refcountbt_init_cursor(pag_mount(pag), tp, agbp, pag); 152 - 153 - error = 
xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, 154 - find_end_of_shared); 155 - 153 + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); 154 + error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, 155 + &found_bno, shared_len, find_end_of_shared); 156 156 xfs_btree_del_cursor(cur, error); 157 - 158 157 xfs_trans_brelse(tp, agbp); 158 + 159 + if (!error && *shared_len) 160 + *shared_offset = found_bno - orig_bno; 161 + out: 162 + xfs_perag_put(pag); 163 + return error; 164 + } 165 + 166 + /* 167 + * Given a file mapping for the rt device, find the lowest-numbered run of 168 + * shared blocks within that mapping and return it in shared_offset/shared_len. 169 + * The offset is relative to the start of irec. 170 + * 171 + * If find_end_of_shared is true, return the longest contiguous extent of shared 172 + * blocks. If there are no shared extents, shared_offset and shared_len will be 173 + * set to 0; 174 + */ 175 + static int 176 + xfs_reflink_find_rtshared( 177 + struct xfs_mount *mp, 178 + struct xfs_trans *tp, 179 + const struct xfs_bmbt_irec *irec, 180 + xfs_extlen_t *shared_offset, 181 + xfs_extlen_t *shared_len, 182 + bool find_end_of_shared) 183 + { 184 + struct xfs_rtgroup *rtg; 185 + struct xfs_btree_cur *cur; 186 + xfs_rgblock_t orig_bno; 187 + xfs_agblock_t found_bno; 188 + int error; 189 + 190 + BUILD_BUG_ON(NULLRGBLOCK != NULLAGBLOCK); 191 + 192 + /* 193 + * Note: this uses the not quite correct xfs_agblock_t type because 194 + * xfs_refcount_find_shared is shared between the RT and data device 195 + * refcount code. 
196 + */ 197 + orig_bno = xfs_rtb_to_rgbno(mp, irec->br_startblock); 198 + rtg = xfs_rtgroup_get(mp, xfs_rtb_to_rgno(mp, irec->br_startblock)); 199 + 200 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_REFCOUNT); 201 + cur = xfs_rtrefcountbt_init_cursor(tp, rtg); 202 + error = xfs_refcount_find_shared(cur, orig_bno, irec->br_blockcount, 203 + &found_bno, shared_len, find_end_of_shared); 204 + xfs_btree_del_cursor(cur, error); 205 + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_REFCOUNT); 206 + xfs_rtgroup_put(rtg); 207 + 208 + if (!error && *shared_len) 209 + *shared_offset = found_bno - orig_bno; 159 210 return error; 160 211 } 161 212 ··· 231 172 bool *shared) 232 173 { 233 174 struct xfs_mount *mp = ip->i_mount; 234 - struct xfs_perag *pag; 235 - xfs_agblock_t agbno; 236 - xfs_extlen_t aglen; 237 - xfs_agblock_t fbno; 238 - xfs_extlen_t flen; 175 + xfs_extlen_t shared_offset, shared_len; 239 176 int error = 0; 240 177 241 178 /* Holes, unwritten, and delalloc extents cannot be shared */ ··· 242 187 243 188 trace_xfs_reflink_trim_around_shared(ip, irec); 244 189 245 - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); 246 - agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); 247 - aglen = irec->br_blockcount; 248 - 249 - error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen, 250 - true); 251 - xfs_perag_put(pag); 190 + if (XFS_IS_REALTIME_INODE(ip)) 191 + error = xfs_reflink_find_rtshared(mp, NULL, irec, 192 + &shared_offset, &shared_len, true); 193 + else 194 + error = xfs_reflink_find_shared(mp, NULL, irec, 195 + &shared_offset, &shared_len, true); 252 196 if (error) 253 197 return error; 254 198 255 - *shared = false; 256 - if (fbno == NULLAGBLOCK) { 199 + if (!shared_len) { 257 200 /* No shared blocks at all. */ 258 - return 0; 259 - } 260 - 261 - if (fbno == agbno) { 201 + *shared = false; 202 + } else if (!shared_offset) { 262 203 /* 263 - * The start of this extent is shared. 
Truncate the 264 - * mapping at the end of the shared region so that a 265 - * subsequent iteration starts at the start of the 266 - * unshared region. 204 + * The start of this mapping points to shared space. Truncate 205 + * the mapping at the end of the shared region so that a 206 + * subsequent iteration starts at the start of the unshared 207 + * region. 267 208 */ 268 - irec->br_blockcount = flen; 209 + irec->br_blockcount = shared_len; 269 210 *shared = true; 270 - return 0; 211 + } else { 212 + /* 213 + * There's a shared region that doesn't start at the beginning 214 + * of the mapping. Truncate the mapping at the start of the 215 + * shared extent so that a subsequent iteration starts at the 216 + * start of the shared region. 217 + */ 218 + irec->br_blockcount = shared_offset; 219 + *shared = false; 271 220 } 272 - 273 - /* 274 - * There's a shared extent midway through this extent. 275 - * Truncate the mapping at the start of the shared 276 - * extent so that a subsequent iteration starts at the 277 - * start of the shared region. 
278 - */ 279 - irec->br_blockcount = fbno - agbno; 280 221 return 0; 281 222 } 282 223 ··· 440 389 struct xfs_mount *mp = ip->i_mount; 441 390 struct xfs_trans *tp; 442 391 xfs_filblks_t resaligned; 443 - xfs_extlen_t resblks; 392 + unsigned int dblocks = 0, rblocks = 0; 444 393 int nimaps; 445 394 int error; 446 395 bool found; 447 396 448 397 resaligned = xfs_aligned_fsb_count(imap->br_startoff, 449 398 imap->br_blockcount, xfs_get_cowextsz_hint(ip)); 450 - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); 399 + if (XFS_IS_REALTIME_INODE(ip)) { 400 + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 401 + rblocks = resaligned; 402 + } else { 403 + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); 404 + rblocks = 0; 405 + } 451 406 452 407 xfs_iunlock(ip, *lockmode); 453 408 *lockmode = 0; 454 409 455 - error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, 456 - false, &tp); 410 + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, 411 + rblocks, false, &tp); 457 412 if (error) 458 413 return error; 459 414 ··· 628 571 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); 629 572 struct xfs_bmbt_irec got, del; 630 573 struct xfs_iext_cursor icur; 574 + bool isrt = XFS_IS_REALTIME_INODE(ip); 631 575 int error = 0; 632 576 633 577 if (!xfs_inode_has_cow_data(ip)) ··· 656 598 ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); 657 599 658 600 /* Free the CoW orphan record. */ 659 - xfs_refcount_free_cow_extent(*tpp, del.br_startblock, 660 - del.br_blockcount); 601 + xfs_refcount_free_cow_extent(*tpp, isrt, 602 + del.br_startblock, del.br_blockcount); 661 603 662 604 error = xfs_free_extent_later(*tpp, del.br_startblock, 663 605 del.br_blockcount, NULL, 664 - XFS_AG_RESV_NONE, 0); 606 + XFS_AG_RESV_NONE, 607 + isrt ? XFS_FREE_EXTENT_REALTIME : 0); 665 608 if (error) 666 609 break; 667 610 ··· 746 687 return error; 747 688 } 748 689 690 + #ifdef CONFIG_XFS_QUOTA 691 + /* 692 + * Update quota accounting for a remapping operation. 
When we're remapping 693 + * something from the CoW fork to the data fork, we must update the quota 694 + * accounting for delayed allocations. For remapping from the data fork to the 695 + * data fork, use regular block accounting. 696 + */ 697 + static inline void 698 + xfs_reflink_update_quota( 699 + struct xfs_trans *tp, 700 + struct xfs_inode *ip, 701 + bool is_cow, 702 + int64_t blocks) 703 + { 704 + unsigned int qflag; 705 + 706 + if (XFS_IS_REALTIME_INODE(ip)) { 707 + qflag = is_cow ? XFS_TRANS_DQ_DELRTBCOUNT : 708 + XFS_TRANS_DQ_RTBCOUNT; 709 + } else { 710 + qflag = is_cow ? XFS_TRANS_DQ_DELBCOUNT : 711 + XFS_TRANS_DQ_BCOUNT; 712 + } 713 + xfs_trans_mod_dquot_byino(tp, ip, qflag, blocks); 714 + } 715 + #else 716 + # define xfs_reflink_update_quota(tp, ip, is_cow, blocks) ((void)0) 717 + #endif 718 + 749 719 /* 750 720 * Remap part of the CoW fork into the data fork. 751 721 * ··· 798 710 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); 799 711 unsigned int resblks; 800 712 int nmaps; 713 + bool isrt = XFS_IS_REALTIME_INODE(ip); 801 714 int error; 802 715 803 716 resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); ··· 868 779 * or not), unmap the extent and drop its refcount. 869 780 */ 870 781 xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); 871 - xfs_refcount_decrease_extent(tp, &data); 872 - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 873 - -data.br_blockcount); 782 + xfs_refcount_decrease_extent(tp, isrt, &data); 783 + xfs_reflink_update_quota(tp, ip, false, -data.br_blockcount); 874 784 } else if (data.br_startblock == DELAYSTARTBLOCK) { 875 785 int done; 876 786 ··· 887 799 } 888 800 889 801 /* Free the CoW orphan record. */ 890 - xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); 802 + xfs_refcount_free_cow_extent(tp, isrt, del.br_startblock, 803 + del.br_blockcount); 891 804 892 805 /* Map the new blocks into the data fork. 
*/ 893 806 xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del); 894 807 895 808 /* Charge this new data fork mapping to the on-disk quota. */ 896 - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, 897 - (long)del.br_blockcount); 809 + xfs_reflink_update_quota(tp, ip, true, del.br_blockcount); 898 810 899 811 /* Remove the mapping from the CoW fork. */ 900 812 xfs_bmap_del_extent_cow(ip, &icur, &got, &del); ··· 983 895 struct xfs_mount *mp) 984 896 { 985 897 struct xfs_perag *pag = NULL; 898 + struct xfs_rtgroup *rtg = NULL; 986 899 int error = 0; 987 900 988 901 if (!xfs_has_reflink(mp)) 989 902 return 0; 990 903 991 904 while ((pag = xfs_perag_next(mp, pag))) { 992 - error = xfs_refcount_recover_cow_leftovers(mp, pag); 905 + error = xfs_refcount_recover_cow_leftovers(pag_group(pag)); 993 906 if (error) { 994 907 xfs_perag_rele(pag); 995 - break; 908 + return error; 996 909 } 997 910 } 998 911 999 - return error; 912 + while ((rtg = xfs_rtgroup_next(mp, rtg))) { 913 + error = xfs_refcount_recover_cow_leftovers(rtg_group(rtg)); 914 + if (error) { 915 + xfs_rtgroup_rele(rtg); 916 + return error; 917 + } 918 + } 919 + 920 + return 0; 1000 921 } 1001 922 1002 923 /* ··· 1197 1100 static int 1198 1101 xfs_reflink_ag_has_free_space( 1199 1102 struct xfs_mount *mp, 1200 - xfs_agnumber_t agno) 1103 + struct xfs_inode *ip, 1104 + xfs_fsblock_t fsb) 1201 1105 { 1202 1106 struct xfs_perag *pag; 1107 + xfs_agnumber_t agno; 1203 1108 int error = 0; 1204 1109 1205 1110 if (!xfs_has_rmapbt(mp)) 1206 1111 return 0; 1112 + if (XFS_IS_REALTIME_INODE(ip)) { 1113 + struct xfs_rtgroup *rtg; 1114 + xfs_rgnumber_t rgno; 1207 1115 1116 + rgno = xfs_rtb_to_rgno(mp, fsb); 1117 + rtg = xfs_rtgroup_get(mp, rgno); 1118 + if (xfs_metafile_resv_critical(rtg_rmap(rtg))) 1119 + error = -ENOSPC; 1120 + xfs_rtgroup_put(rtg); 1121 + return error; 1122 + } 1123 + 1124 + agno = XFS_FSB_TO_AGNO(mp, fsb); 1208 1125 pag = xfs_perag_get(mp, agno); 1209 1126 if (xfs_ag_resv_critical(pag, 
XFS_AG_RESV_RMAPBT) || 1210 1127 xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) ··· 1242 1131 struct xfs_trans *tp; 1243 1132 xfs_off_t newlen; 1244 1133 int64_t qdelta = 0; 1245 - unsigned int resblks; 1134 + unsigned int dblocks, rblocks, resblks; 1246 1135 bool quota_reserved = true; 1247 1136 bool smap_real; 1248 1137 bool dmap_written = xfs_bmap_is_written_extent(dmap); 1138 + bool isrt = XFS_IS_REALTIME_INODE(ip); 1249 1139 int iext_delta = 0; 1250 1140 int nimaps; 1251 1141 int error; ··· 1273 1161 * we're remapping. 1274 1162 */ 1275 1163 resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); 1164 + if (XFS_IS_REALTIME_INODE(ip)) { 1165 + dblocks = resblks; 1166 + rblocks = dmap->br_blockcount; 1167 + } else { 1168 + dblocks = resblks + dmap->br_blockcount; 1169 + rblocks = 0; 1170 + } 1276 1171 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 1277 - resblks + dmap->br_blockcount, 0, false, &tp); 1172 + dblocks, rblocks, false, &tp); 1278 1173 if (error == -EDQUOT || error == -ENOSPC) { 1279 1174 quota_reserved = false; 1280 1175 error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, ··· 1332 1213 1333 1214 /* No reflinking if the AG of the dest mapping is low on space. */ 1334 1215 if (dmap_written) { 1335 - error = xfs_reflink_ag_has_free_space(mp, 1336 - XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); 1216 + error = xfs_reflink_ag_has_free_space(mp, ip, 1217 + dmap->br_startblock); 1337 1218 if (error) 1338 1219 goto out_cancel; 1339 1220 } ··· 1361 1242 * done. 
1362 1243 */ 1363 1244 if (!quota_reserved && !smap_real && dmap_written) { 1364 - error = xfs_trans_reserve_quota_nblks(tp, ip, 1365 - dmap->br_blockcount, 0, false); 1245 + if (XFS_IS_REALTIME_INODE(ip)) { 1246 + dblocks = 0; 1247 + rblocks = dmap->br_blockcount; 1248 + } else { 1249 + dblocks = dmap->br_blockcount; 1250 + rblocks = 0; 1251 + } 1252 + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, 1253 + false); 1366 1254 if (error) 1367 1255 goto out_cancel; 1368 1256 } ··· 1390 1264 * or not), unmap the extent and drop its refcount. 1391 1265 */ 1392 1266 xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap); 1393 - xfs_refcount_decrease_extent(tp, &smap); 1267 + xfs_refcount_decrease_extent(tp, isrt, &smap); 1394 1268 qdelta -= smap.br_blockcount; 1395 1269 } else if (smap.br_startblock == DELAYSTARTBLOCK) { 1396 1270 int done; ··· 1413 1287 * its refcount and map it into the file. 1414 1288 */ 1415 1289 if (dmap_written) { 1416 - xfs_refcount_increase_extent(tp, dmap); 1290 + xfs_refcount_increase_extent(tp, isrt, dmap); 1417 1291 xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap); 1418 1292 qdelta += dmap->br_blockcount; 1419 1293 } 1420 1294 1421 - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); 1295 + xfs_reflink_update_quota(tp, ip, false, qdelta); 1422 1296 1423 1297 /* Update dest isize if needed. */ 1424 1298 newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); ··· 1592 1466 1593 1467 /* Check file eligibility and prepare for block sharing. */ 1594 1468 ret = -EINVAL; 1595 - /* Don't reflink realtime inodes */ 1596 - if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) 1469 + /* Can't reflink between data and rt volumes */ 1470 + if (XFS_IS_REALTIME_INODE(src) != XFS_IS_REALTIME_INODE(dest)) 1597 1471 goto out_unlock; 1598 1472 1599 1473 /* Don't share DAX file data with non-DAX file. 
*/ ··· 1673 1547 *has_shared = false; 1674 1548 found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); 1675 1549 while (found) { 1676 - struct xfs_perag *pag; 1677 - xfs_agblock_t agbno; 1678 - xfs_extlen_t aglen; 1679 - xfs_agblock_t rbno; 1680 - xfs_extlen_t rlen; 1550 + xfs_extlen_t shared_offset, shared_len; 1681 1551 1682 1552 if (isnullstartblock(got.br_startblock) || 1683 1553 got.br_state != XFS_EXT_NORM) 1684 1554 goto next; 1685 1555 1686 - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock)); 1687 - agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); 1688 - aglen = got.br_blockcount; 1689 - error = xfs_reflink_find_shared(pag, tp, agbno, aglen, 1690 - &rbno, &rlen, false); 1691 - xfs_perag_put(pag); 1556 + if (XFS_IS_REALTIME_INODE(ip)) 1557 + error = xfs_reflink_find_rtshared(mp, tp, &got, 1558 + &shared_offset, &shared_len, false); 1559 + else 1560 + error = xfs_reflink_find_shared(mp, tp, &got, 1561 + &shared_offset, &shared_len, false); 1692 1562 if (error) 1693 1563 return error; 1694 1564 1695 1565 /* Is there still a shared block here? */ 1696 - if (rbno != NULLAGBLOCK) { 1566 + if (shared_len) { 1697 1567 *has_shared = true; 1698 1568 return 0; 1699 1569 } ··· 1821 1699 out: 1822 1700 trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); 1823 1701 return error; 1702 + } 1703 + 1704 + /* 1705 + * Can we use reflink with this realtime extent size? Note that we don't check 1706 + * for rblocks > 0 here because this can be called as part of attaching a new 1707 + * rt section. 
1708 + */ 1709 + bool 1710 + xfs_reflink_supports_rextsize( 1711 + struct xfs_mount *mp, 1712 + unsigned int rextsize) 1713 + { 1714 + /* reflink on the realtime device requires rtgroups */ 1715 + if (!xfs_has_rtgroups(mp)) 1716 + return false; 1717 + 1718 + /* 1719 + * Reflink doesn't support rt extent size larger than a single fsblock 1720 + * because we would have to perform CoW-around for unaligned write 1721 + * requests to guarantee that we always remap entire rt extents. 1722 + */ 1723 + if (rextsize != 1) 1724 + return false; 1725 + 1726 + return true; 1824 1727 }

+3 -1

fs/xfs/xfs_reflink.h

··· 25 25 return true; 26 26 } 27 27 28 - extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, 28 + int xfs_reflink_trim_around_shared(struct xfs_inode *ip, 29 29 struct xfs_bmbt_irec *irec, bool *shared); 30 30 int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, 31 31 bool *shared); ··· 61 61 loff_t *remapped); 62 62 extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, 63 63 xfs_extlen_t cowextsize, unsigned int remap_flags); 64 + 65 + bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); 64 66 65 67 #endif /* __XFS_REFLINK_H */

+206 -24

fs/xfs/xfs_rmap_item.c

··· 23 23 #include "xfs_ag.h" 24 24 #include "xfs_btree.h" 25 25 #include "xfs_trace.h" 26 + #include "xfs_rtgroup.h" 26 27 27 28 struct kmem_cache *xfs_rui_cache; 28 29 struct kmem_cache *xfs_rud_cache; ··· 95 94 ASSERT(atomic_read(&ruip->rui_next_extent) == 96 95 ruip->rui_format.rui_nextents); 97 96 98 - ruip->rui_format.rui_type = XFS_LI_RUI; 97 + ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT); 98 + 99 + ruip->rui_format.rui_type = lip->li_type; 99 100 ruip->rui_format.rui_size = 1; 100 101 101 102 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, ··· 140 137 STATIC struct xfs_rui_log_item * 141 138 xfs_rui_init( 142 139 struct xfs_mount *mp, 140 + unsigned short item_type, 143 141 uint nextents) 144 142 145 143 { 146 144 struct xfs_rui_log_item *ruip; 147 145 148 146 ASSERT(nextents > 0); 147 + ASSERT(item_type == XFS_LI_RUI || item_type == XFS_LI_RUI_RT); 148 + 149 149 if (nextents > XFS_RUI_MAX_FAST_EXTENTS) 150 150 ruip = kzalloc(xfs_rui_log_item_sizeof(nextents), 151 151 GFP_KERNEL | __GFP_NOFAIL); ··· 156 150 ruip = kmem_cache_zalloc(xfs_rui_cache, 157 151 GFP_KERNEL | __GFP_NOFAIL); 158 152 159 - xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); 153 + xfs_log_item_init(mp, &ruip->rui_item, item_type, &xfs_rui_item_ops); 160 154 ruip->rui_format.rui_nextents = nextents; 161 155 ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; 162 156 atomic_set(&ruip->rui_next_extent, 0); ··· 195 189 struct xfs_rud_log_item *rudp = RUD_ITEM(lip); 196 190 struct xfs_log_iovec *vecp = NULL; 197 191 198 - rudp->rud_format.rud_type = XFS_LI_RUD; 192 + ASSERT(lip->li_type == XFS_LI_RUD || lip->li_type == XFS_LI_RUD_RT); 193 + 194 + rudp->rud_format.rud_type = lip->li_type; 199 195 rudp->rud_format.rud_size = 1; 200 196 201 197 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUD_FORMAT, &rudp->rud_format, ··· 239 231 static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e) 240 232 { 241 233 return 
list_entry(e, struct xfs_rmap_intent, ri_list); 234 + } 235 + 236 + static inline bool 237 + xfs_rui_item_isrt(const struct xfs_log_item *lip) 238 + { 239 + ASSERT(lip->li_type == XFS_LI_RUI || lip->li_type == XFS_LI_RUI_RT); 240 + 241 + return lip->li_type == XFS_LI_RUI_RT; 242 242 } 243 243 244 244 /* Sort rmap intents by AG. */ ··· 321 305 } 322 306 323 307 static struct xfs_log_item * 308 + __xfs_rmap_update_create_intent( 309 + struct xfs_trans *tp, 310 + struct list_head *items, 311 + unsigned int count, 312 + bool sort, 313 + unsigned short item_type) 314 + { 315 + struct xfs_mount *mp = tp->t_mountp; 316 + struct xfs_rui_log_item *ruip; 317 + struct xfs_rmap_intent *ri; 318 + 319 + ASSERT(count > 0); 320 + 321 + ruip = xfs_rui_init(mp, item_type, count); 322 + if (sort) 323 + list_sort(mp, items, xfs_rmap_update_diff_items); 324 + list_for_each_entry(ri, items, ri_list) 325 + xfs_rmap_update_log_item(tp, ruip, ri); 326 + return &ruip->rui_item; 327 + } 328 + 329 + static struct xfs_log_item * 324 330 xfs_rmap_update_create_intent( 325 331 struct xfs_trans *tp, 326 332 struct list_head *items, 327 333 unsigned int count, 328 334 bool sort) 329 335 { 330 - struct xfs_mount *mp = tp->t_mountp; 331 - struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); 332 - struct xfs_rmap_intent *ri; 336 + return __xfs_rmap_update_create_intent(tp, items, count, sort, 337 + XFS_LI_RUI); 338 + } 333 339 334 - ASSERT(count > 0); 335 - 336 - if (sort) 337 - list_sort(mp, items, xfs_rmap_update_diff_items); 338 - list_for_each_entry(ri, items, ri_list) 339 - xfs_rmap_update_log_item(tp, ruip, ri); 340 - return &ruip->rui_item; 340 + static inline unsigned short 341 + xfs_rud_type_from_rui(const struct xfs_rui_log_item *ruip) 342 + { 343 + return xfs_rui_item_isrt(&ruip->rui_item) ? XFS_LI_RUD_RT : XFS_LI_RUD; 341 344 } 342 345 343 346 /* Get an RUD so we can process all the deferred rmap updates. 
*/ ··· 370 335 struct xfs_rud_log_item *rudp; 371 336 372 337 rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); 373 - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, 374 - &xfs_rud_item_ops); 338 + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, 339 + xfs_rud_type_from_rui(ruip), &xfs_rud_item_ops); 375 340 rudp->rud_ruip = ruip; 376 341 rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; 377 342 ··· 386 351 { 387 352 struct xfs_mount *mp = tp->t_mountp; 388 353 389 - trace_xfs_rmap_defer(mp, ri); 390 - 354 + /* 355 + * Deferred rmap updates for the realtime and data sections must use 356 + * separate transactions to finish deferred work because updates to 357 + * realtime metadata files can lock AGFs to allocate btree blocks and 358 + * we don't want that mixing with the AGF locks taken to finish data 359 + * section updates. 360 + */ 391 361 ri->ri_group = xfs_group_intent_get(mp, ri->ri_bmap.br_startblock, 392 - XG_TYPE_AG); 393 - xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); 362 + ri->ri_realtime ? XG_TYPE_RTG : XG_TYPE_AG); 363 + 364 + trace_xfs_rmap_defer(mp, ri); 365 + xfs_defer_add(tp, &ri->ri_list, ri->ri_realtime ? 366 + &xfs_rtrmap_update_defer_type : 367 + &xfs_rmap_update_defer_type); 394 368 } 395 369 396 370 /* Cancel a deferred rmap update. 
*/ ··· 459 415 static inline bool 460 416 xfs_rui_validate_map( 461 417 struct xfs_mount *mp, 418 + bool isrt, 462 419 struct xfs_map_extent *map) 463 420 { 464 421 if (!xfs_has_rmapbt(mp)) ··· 489 444 if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) 490 445 return false; 491 446 447 + if (isrt) 448 + return xfs_verify_rtbext(mp, map->me_startblock, map->me_len); 449 + 492 450 return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); 493 451 } 494 452 ··· 499 451 xfs_rui_recover_work( 500 452 struct xfs_mount *mp, 501 453 struct xfs_defer_pending *dfp, 454 + bool isrt, 502 455 const struct xfs_map_extent *map) 503 456 { 504 457 struct xfs_rmap_intent *ri; ··· 544 495 ri->ri_bmap.br_blockcount = map->me_len; 545 496 ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 546 497 XFS_EXT_UNWRITTEN : XFS_EXT_NORM; 547 - ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, XG_TYPE_AG); 498 + ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, 499 + isrt ? XG_TYPE_RTG : XG_TYPE_AG); 500 + ri->ri_realtime = isrt; 548 501 549 502 xfs_defer_add_item(dfp, &ri->ri_list); 550 503 } ··· 565 514 struct xfs_rui_log_item *ruip = RUI_ITEM(lip); 566 515 struct xfs_trans *tp; 567 516 struct xfs_mount *mp = lip->li_log->l_mp; 517 + bool isrt = xfs_rui_item_isrt(lip); 568 518 int i; 569 519 int error = 0; 570 520 ··· 575 523 * just toss the RUI. 
576 524 */ 577 525 for (i = 0; i < ruip->rui_format.rui_nextents; i++) { 578 - if (!xfs_rui_validate_map(mp, 526 + if (!xfs_rui_validate_map(mp, isrt, 579 527 &ruip->rui_format.rui_extents[i])) { 580 528 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 581 529 &ruip->rui_format, ··· 583 531 return -EFSCORRUPTED; 584 532 } 585 533 586 - xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); 534 + xfs_rui_recover_work(mp, dfp, isrt, 535 + &ruip->rui_format.rui_extents[i]); 587 536 } 588 537 589 538 resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); ··· 619 566 struct xfs_map_extent *map; 620 567 unsigned int count; 621 568 569 + ASSERT(intent->li_type == XFS_LI_RUI || 570 + intent->li_type == XFS_LI_RUI_RT); 571 + 622 572 count = RUI_ITEM(intent)->rui_format.rui_nextents; 623 573 map = RUI_ITEM(intent)->rui_format.rui_extents; 624 574 625 - ruip = xfs_rui_init(tp->t_mountp, count); 575 + ruip = xfs_rui_init(tp->t_mountp, intent->li_type, count); 626 576 memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); 627 577 atomic_set(&ruip->rui_next_extent, count); 628 578 ··· 644 588 .recover_work = xfs_rmap_recover_work, 645 589 .relog_intent = xfs_rmap_relog_intent, 646 590 }; 591 + 592 + #ifdef CONFIG_XFS_RT 593 + static struct xfs_log_item * 594 + xfs_rtrmap_update_create_intent( 595 + struct xfs_trans *tp, 596 + struct list_head *items, 597 + unsigned int count, 598 + bool sort) 599 + { 600 + return __xfs_rmap_update_create_intent(tp, items, count, sort, 601 + XFS_LI_RUI_RT); 602 + } 603 + 604 + /* Clean up after calling xfs_rmap_finish_one. 
*/ 605 + STATIC void 606 + xfs_rtrmap_finish_one_cleanup( 607 + struct xfs_trans *tp, 608 + struct xfs_btree_cur *rcur, 609 + int error) 610 + { 611 + if (rcur) 612 + xfs_btree_del_cursor(rcur, error); 613 + } 614 + 615 + const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = { 616 + .name = "rtrmap", 617 + .max_items = XFS_RUI_MAX_FAST_EXTENTS, 618 + .create_intent = xfs_rtrmap_update_create_intent, 619 + .abort_intent = xfs_rmap_update_abort_intent, 620 + .create_done = xfs_rmap_update_create_done, 621 + .finish_item = xfs_rmap_update_finish_item, 622 + .finish_cleanup = xfs_rtrmap_finish_one_cleanup, 623 + .cancel_item = xfs_rmap_update_cancel_item, 624 + .recover_work = xfs_rmap_recover_work, 625 + .relog_intent = xfs_rmap_relog_intent, 626 + }; 627 + #else 628 + const struct xfs_defer_op_type xfs_rtrmap_update_defer_type = { 629 + .name = "rtrmap", 630 + }; 631 + #endif 647 632 648 633 STATIC bool 649 634 xfs_rui_item_match( ··· 751 654 return -EFSCORRUPTED; 752 655 } 753 656 754 - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); 657 + ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents); 755 658 xfs_rui_copy_format(&ruip->rui_format, rui_formatp); 756 659 atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); 757 660 ··· 763 666 const struct xlog_recover_item_ops xlog_rui_item_ops = { 764 667 .item_type = XFS_LI_RUI, 765 668 .commit_pass2 = xlog_recover_rui_commit_pass2, 669 + }; 670 + 671 + #ifdef CONFIG_XFS_RT 672 + STATIC int 673 + xlog_recover_rtrui_commit_pass2( 674 + struct xlog *log, 675 + struct list_head *buffer_list, 676 + struct xlog_recover_item *item, 677 + xfs_lsn_t lsn) 678 + { 679 + struct xfs_mount *mp = log->l_mp; 680 + struct xfs_rui_log_item *ruip; 681 + struct xfs_rui_log_format *rui_formatp; 682 + size_t len; 683 + 684 + rui_formatp = item->ri_buf[0].i_addr; 685 + 686 + if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { 687 + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 688 + 
item->ri_buf[0].i_addr, item->ri_buf[0].i_len); 689 + return -EFSCORRUPTED; 690 + } 691 + 692 + len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); 693 + if (item->ri_buf[0].i_len != len) { 694 + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 695 + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); 696 + return -EFSCORRUPTED; 697 + } 698 + 699 + ruip = xfs_rui_init(mp, ITEM_TYPE(item), rui_formatp->rui_nextents); 700 + xfs_rui_copy_format(&ruip->rui_format, rui_formatp); 701 + atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); 702 + 703 + xlog_recover_intent_item(log, &ruip->rui_item, lsn, 704 + &xfs_rtrmap_update_defer_type); 705 + return 0; 706 + } 707 + #else 708 + STATIC int 709 + xlog_recover_rtrui_commit_pass2( 710 + struct xlog *log, 711 + struct list_head *buffer_list, 712 + struct xlog_recover_item *item, 713 + xfs_lsn_t lsn) 714 + { 715 + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, 716 + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); 717 + return -EFSCORRUPTED; 718 + } 719 + #endif 720 + 721 + const struct xlog_recover_item_ops xlog_rtrui_item_ops = { 722 + .item_type = XFS_LI_RUI_RT, 723 + .commit_pass2 = xlog_recover_rtrui_commit_pass2, 766 724 }; 767 725 768 726 /* ··· 850 698 const struct xlog_recover_item_ops xlog_rud_item_ops = { 851 699 .item_type = XFS_LI_RUD, 852 700 .commit_pass2 = xlog_recover_rud_commit_pass2, 701 + }; 702 + 703 + #ifdef CONFIG_XFS_RT 704 + STATIC int 705 + xlog_recover_rtrud_commit_pass2( 706 + struct xlog *log, 707 + struct list_head *buffer_list, 708 + struct xlog_recover_item *item, 709 + xfs_lsn_t lsn) 710 + { 711 + struct xfs_rud_log_format *rud_formatp; 712 + 713 + rud_formatp = item->ri_buf[0].i_addr; 714 + if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { 715 + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, 716 + rud_formatp, item->ri_buf[0].i_len); 717 + return -EFSCORRUPTED; 718 + } 719 + 720 + xlog_recover_release_intent(log, XFS_LI_RUI_RT, 721 + 
rud_formatp->rud_rui_id); 722 + return 0; 723 + } 724 + #else 725 + # define xlog_recover_rtrud_commit_pass2 xlog_recover_rtrui_commit_pass2 726 + #endif 727 + 728 + const struct xlog_recover_item_ops xlog_rtrud_item_ops = { 729 + .item_type = XFS_LI_RUD_RT, 730 + .commit_pass2 = xlog_recover_rtrud_commit_pass2, 853 731 };

+100 -21

fs/xfs/xfs_rtalloc.c

··· 22 22 #include "xfs_rtalloc.h" 23 23 #include "xfs_sb.h" 24 24 #include "xfs_rtbitmap.h" 25 + #include "xfs_rtrmap_btree.h" 25 26 #include "xfs_quota.h" 26 27 #include "xfs_log_priv.h" 27 28 #include "xfs_health.h" ··· 31 30 #include "xfs_rtgroup.h" 32 31 #include "xfs_error.h" 33 32 #include "xfs_trace.h" 33 + #include "xfs_rtrefcount_btree.h" 34 + #include "xfs_reflink.h" 34 35 35 36 /* 36 37 * Return whether there are any free extents in the size range given ··· 595 592 * specified. If we don't get maxlen then use prod to trim 596 593 * the length, if given. The lengths are all in rtextents. 597 594 */ 598 - STATIC int 595 + static int 599 596 xfs_rtallocate_extent_size( 600 597 struct xfs_rtalloc_args *args, 601 598 xfs_rtxlen_t minlen, /* minimum length to allocate */ ··· 848 845 mp->m_rtsb_bp = rtsb_bp; 849 846 error = xfs_bwrite(rtsb_bp); 850 847 xfs_buf_unlock(rtsb_bp); 848 + if (error) 849 + return error; 850 + 851 + /* Initialize the rtrmap to reflect the rtsb. */ 852 + if (rtg_rmap(args->rtg) != NULL) 853 + error = xfs_rtrmapbt_init_rtsb(nargs->mp, args->rtg, args->tp); 854 + 851 855 return error; 852 856 } 853 857 ··· 866 856 xfs_fileoff_t bmbno) 867 857 { 868 858 struct xfs_mount *mp = rtg_mount(rtg); 869 - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; 870 - struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; 859 + struct xfs_inode *rbmip = rtg_bitmap(rtg); 860 + struct xfs_inode *rsumip = rtg_summary(rtg); 871 861 struct xfs_rtalloc_args args = { 872 862 .mp = mp, 873 863 .rtg = rtg, ··· 903 893 goto out_free; 904 894 nargs.tp = args.tp; 905 895 906 - xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP); 907 - xfs_rtgroup_trans_join(args.tp, args.rtg, XFS_RTGLOCK_BITMAP); 896 + xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_RMAP); 897 + xfs_rtgroup_trans_join(args.tp, args.rtg, 898 + XFS_RTGLOCK_BITMAP | XFS_RTGLOCK_RMAP); 908 899 909 900 /* 910 901 * Update the bitmap inode's size ondisk and incore. 
We need to update ··· 991 980 goto out_free; 992 981 993 982 /* 994 - * Ensure the mount RT feature flag is now set. 983 + * Ensure the mount RT feature flag is now set, and compute new 984 + * maxlevels for rt btrees. 995 985 */ 996 986 mp->m_features |= XFS_FEAT_REALTIME; 987 + xfs_rtrmapbt_compute_maxlevels(mp); 988 + xfs_rtrefcountbt_compute_maxlevels(mp); 997 989 998 990 kfree(nmp); 999 991 return 0; ··· 1055 1041 xfs_extlen_t *nrbmblocks) 1056 1042 { 1057 1043 struct xfs_mount *mp = rtg_mount(rtg); 1058 - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; 1059 - struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY]; 1044 + struct xfs_inode *rbmip = rtg_bitmap(rtg); 1045 + struct xfs_inode *rsumip = rtg_summary(rtg); 1060 1046 xfs_extlen_t orbmblocks = 0; 1061 1047 xfs_extlen_t orsumblocks = 0; 1062 1048 struct xfs_mount *nmp; ··· 1164 1150 return error; 1165 1151 } 1166 1152 1167 - static int 1153 + int 1168 1154 xfs_growfs_check_rtgeom( 1169 1155 const struct xfs_mount *mp, 1156 + xfs_rfsblock_t dblocks, 1170 1157 xfs_rfsblock_t rblocks, 1171 1158 xfs_extlen_t rextsize) 1172 1159 { 1160 + xfs_extlen_t min_logfsbs; 1173 1161 struct xfs_mount *nmp; 1174 - int error = 0; 1175 1162 1176 1163 nmp = xfs_growfs_rt_alloc_fake_mount(mp, rblocks, rextsize); 1177 1164 if (!nmp) 1178 1165 return -ENOMEM; 1166 + nmp->m_sb.sb_dblocks = dblocks; 1167 + 1168 + xfs_rtrmapbt_compute_maxlevels(nmp); 1169 + xfs_rtrefcountbt_compute_maxlevels(nmp); 1170 + xfs_trans_resv_calc(nmp, M_RES(nmp)); 1179 1171 1180 1172 /* 1181 1173 * New summary size can't be more than half the size of the log. This 1182 1174 * prevents us from getting a log overflow, since we'll log basically 1183 1175 * the whole summary file at once. 
1184 1176 */ 1185 - if (nmp->m_rsumblocks > (mp->m_sb.sb_logblocks >> 1)) 1186 - error = -EINVAL; 1177 + min_logfsbs = min_t(xfs_extlen_t, xfs_log_calc_minimum_size(nmp), 1178 + nmp->m_rsumblocks * 2); 1187 1179 1188 1180 kfree(nmp); 1189 - return error; 1181 + 1182 + if (min_logfsbs > mp->m_sb.sb_logblocks) 1183 + return -EINVAL; 1184 + return 0; 1190 1185 } 1191 1186 1192 1187 /* ··· 1286 1263 XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE) 1287 1264 goto out_unlock; 1288 1265 1289 - /* Unsupported realtime features. */ 1266 + /* Check for features supported only on rtgroups filesystems. */ 1290 1267 error = -EOPNOTSUPP; 1291 - if (xfs_has_quota(mp) && !xfs_has_rtgroups(mp)) 1292 - goto out_unlock; 1293 - if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) 1268 + if (!xfs_has_rtgroups(mp)) { 1269 + if (xfs_has_rmapbt(mp)) 1270 + goto out_unlock; 1271 + if (xfs_has_quota(mp)) 1272 + goto out_unlock; 1273 + if (xfs_has_reflink(mp)) 1274 + goto out_unlock; 1275 + } else if (xfs_has_reflink(mp) && 1276 + !xfs_reflink_supports_rextsize(mp, in->extsize)) 1294 1277 goto out_unlock; 1295 1278 1296 1279 error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); ··· 1320 1291 goto out_unlock; 1321 1292 1322 1293 /* Make sure the new fs size won't cause problems with the log. */ 1323 - error = xfs_growfs_check_rtgeom(mp, in->newblocks, in->extsize); 1294 + error = xfs_growfs_check_rtgeom(mp, mp->m_sb.sb_dblocks, in->newblocks, 1295 + in->extsize); 1324 1296 if (error) 1325 1297 goto out_unlock; 1326 1298 ··· 1373 1343 int error2 = xfs_update_secondary_sbs(mp); 1374 1344 1375 1345 if (!error) 1346 + error = error2; 1347 + 1348 + /* Reset the rt metadata btree space reservations. */ 1349 + xfs_rt_resv_free(mp); 1350 + error2 = xfs_rt_resv_init(mp); 1351 + if (error2 && error2 != -ENOSPC) 1376 1352 error = error2; 1377 1353 } 1378 1354 ··· 1523 1487 return 0; 1524 1488 } 1525 1489 1490 + /* Free space reservations for rt metadata inodes. 
*/ 1491 + void 1492 + xfs_rt_resv_free( 1493 + struct xfs_mount *mp) 1494 + { 1495 + struct xfs_rtgroup *rtg = NULL; 1496 + unsigned int i; 1497 + 1498 + while ((rtg = xfs_rtgroup_next(mp, rtg))) { 1499 + for (i = 0; i < XFS_RTGI_MAX; i++) 1500 + xfs_metafile_resv_free(rtg->rtg_inodes[i]); 1501 + } 1502 + } 1503 + 1504 + /* Reserve space for rt metadata inodes' space expansion. */ 1505 + int 1506 + xfs_rt_resv_init( 1507 + struct xfs_mount *mp) 1508 + { 1509 + struct xfs_rtgroup *rtg = NULL; 1510 + xfs_filblks_t ask; 1511 + int error = 0; 1512 + 1513 + while ((rtg = xfs_rtgroup_next(mp, rtg))) { 1514 + int err2; 1515 + 1516 + ask = xfs_rtrmapbt_calc_reserves(mp); 1517 + err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); 1518 + if (err2 && !error) 1519 + error = err2; 1520 + 1521 + ask = xfs_rtrefcountbt_calc_reserves(mp); 1522 + err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); 1523 + if (err2 && !error) 1524 + error = err2; 1525 + } 1526 + 1527 + return error; 1528 + } 1529 + 1526 1530 /* 1527 1531 * Read in the bmbt of an rt metadata inode so that we never have to load them 1528 1532 * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. 
Use ··· 1677 1601 xfs_rtxlen_t len) /* allocation length (rtextents) */ 1678 1602 { 1679 1603 struct xfs_mount *mp = rtg_mount(rtg); 1680 - struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP]; 1604 + struct xfs_inode *rbmip = rtg_bitmap(rtg); 1681 1605 xfs_rtxnum_t b = 0; /* result rtext */ 1682 1606 int log2; /* log of sequence number */ 1683 1607 uint64_t resid; /* residual after log removed */ ··· 1961 1885 goto out_release; 1962 1886 } 1963 1887 1964 - static int 1888 + int 1965 1889 xfs_rtallocate_rtgs( 1966 1890 struct xfs_trans *tp, 1967 1891 xfs_fsblock_t bno_hint, ··· 2026 1950 if (*noalign) { 2027 1951 align = mp->m_sb.sb_rextsize; 2028 1952 } else { 2029 - align = xfs_get_extsz_hint(ap->ip); 1953 + if (ap->flags & XFS_BMAPI_COWFORK) 1954 + align = xfs_get_cowextsz_hint(ap->ip); 1955 + else 1956 + align = xfs_get_extsz_hint(ap->ip); 2030 1957 if (!align) 2031 1958 align = 1; 2032 1959 if (align == mp->m_sb.sb_rextsize)

+20

fs/xfs/xfs_rtalloc.h

··· 34 34 xfs_rtmount_inodes( 35 35 struct xfs_mount *mp); /* file system mount structure */ 36 36 37 + void xfs_rt_resv_free(struct xfs_mount *mp); 38 + int xfs_rt_resv_init(struct xfs_mount *mp); 39 + 37 40 /* 38 41 * Grow the realtime area of the filesystem. 39 42 */ ··· 46 43 xfs_growfs_rt_t *in); /* user supplied growfs struct */ 47 44 48 45 int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); 46 + int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, xfs_rfsblock_t dblocks, 47 + xfs_rfsblock_t rblocks, xfs_agblock_t rextsize); 49 48 #else 50 49 # define xfs_growfs_rt(mp,in) (-ENOSYS) 51 50 # define xfs_rtalloc_reinit_frextents(m) (0) ··· 65 60 } 66 61 # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) 67 62 # define xfs_rtunmount_inodes(m) 63 + # define xfs_rt_resv_free(mp) ((void)0) 64 + # define xfs_rt_resv_init(mp) (0) 65 + 66 + static inline int 67 + xfs_growfs_check_rtgeom(const struct xfs_mount *mp, 68 + xfs_rfsblock_t dblocks, xfs_rfsblock_t rblocks, 69 + xfs_extlen_t rextsize) 70 + { 71 + return 0; 72 + } 68 73 #endif /* CONFIG_XFS_RT */ 74 + 75 + int xfs_rtallocate_rtgs(struct xfs_trans *tp, xfs_fsblock_t bno_hint, 76 + xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen, xfs_rtxlen_t prod, 77 + bool wasdel, bool initial_user_data, xfs_rtblock_t *bno, 78 + xfs_extlen_t *blen); 69 79 70 80 #endif /* __XFS_RTALLOC_H__ */

+4 -1

fs/xfs/xfs_stats.c

··· 52 52 { "rmapbt", xfsstats_offset(xs_refcbt_2) }, 53 53 { "refcntbt", xfsstats_offset(xs_rmap_mem_2) }, 54 54 { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) }, 55 - { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)}, 55 + { "rcbagbt", xfsstats_offset(xs_rtrmap_2) }, 56 + { "rtrmapbt", xfsstats_offset(xs_rtrmap_mem_2)}, 57 + { "rtrmapbt_mem", xfsstats_offset(xs_rtrefcbt_2) }, 58 + { "rtrefcntbt", xfsstats_offset(xs_qm_dqreclaims)}, 56 59 /* we print both series of quota information together */ 57 60 { "qm", xfsstats_offset(xs_xstrat_bytes)}, 58 61 };

+3

fs/xfs/xfs_stats.h

··· 127 127 uint32_t xs_refcbt_2[__XBTS_MAX]; 128 128 uint32_t xs_rmap_mem_2[__XBTS_MAX]; 129 129 uint32_t xs_rcbag_2[__XBTS_MAX]; 130 + uint32_t xs_rtrmap_2[__XBTS_MAX]; 131 + uint32_t xs_rtrmap_mem_2[__XBTS_MAX]; 132 + uint32_t xs_rtrefcbt_2[__XBTS_MAX]; 130 133 uint32_t xs_qm_dqreclaims; 131 134 uint32_t xs_qm_dqreclaim_misses; 132 135 uint32_t xs_qm_dquot_dups;

+82 -60

fs/xfs/xfs_super.c

··· 819 819 return 0; 820 820 } 821 821 822 + static xfs_extlen_t 823 + xfs_internal_log_size( 824 + struct xfs_mount *mp) 825 + { 826 + if (!mp->m_sb.sb_logstart) 827 + return 0; 828 + return mp->m_sb.sb_logblocks; 829 + } 830 + 831 + static void 832 + xfs_statfs_data( 833 + struct xfs_mount *mp, 834 + struct kstatfs *st) 835 + { 836 + int64_t fdblocks = 837 + percpu_counter_sum(&mp->m_fdblocks); 838 + 839 + /* make sure st->f_bfree does not underflow */ 840 + st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp)); 841 + /* 842 + * sb_dblocks can change during growfs, but nothing cares about reporting 843 + * the old or new value during growfs. 844 + */ 845 + st->f_blocks = mp->m_sb.sb_dblocks - xfs_internal_log_size(mp); 846 + } 847 + 848 + /* 849 + * When stat(v)fs is called on a file with the realtime bit set or a directory 850 + * with the rtinherit bit, report freespace information for the RT device 851 + * instead of the main data device. 852 + */ 853 + static void 854 + xfs_statfs_rt( 855 + struct xfs_mount *mp, 856 + struct kstatfs *st) 857 + { 858 + st->f_bfree = xfs_rtbxlen_to_blen(mp, 859 + percpu_counter_sum_positive(&mp->m_frextents)); 860 + st->f_blocks = mp->m_sb.sb_rblocks; 861 + } 862 + 863 + static void 864 + xfs_statfs_inodes( 865 + struct xfs_mount *mp, 866 + struct kstatfs *st) 867 + { 868 + uint64_t icount = percpu_counter_sum(&mp->m_icount); 869 + uint64_t ifree = percpu_counter_sum(&mp->m_ifree); 870 + uint64_t fakeinos = XFS_FSB_TO_INO(mp, st->f_bfree); 871 + 872 + st->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); 873 + if (M_IGEO(mp)->maxicount) 874 + st->f_files = min_t(typeof(st->f_files), st->f_files, 875 + M_IGEO(mp)->maxicount); 876 + 877 + /* If sb_icount overshot maxicount, report actual allocation */ 878 + st->f_files = max_t(typeof(st->f_files), st->f_files, 879 + mp->m_sb.sb_icount); 880 + 881 + /* Make sure st->f_ffree does not underflow */ 882 + st->f_ffree = max_t(int64_t, 0, st->f_files - (icount - 
ifree)); 883 + } 884 + 822 885 STATIC int 823 886 xfs_fs_statfs( 824 887 struct dentry *dentry, 825 - struct kstatfs *statp) 888 + struct kstatfs *st) 826 889 { 827 890 struct xfs_mount *mp = XFS_M(dentry->d_sb); 828 - xfs_sb_t *sbp = &mp->m_sb; 829 891 struct xfs_inode *ip = XFS_I(d_inode(dentry)); 830 - uint64_t fakeinos, id; 831 - uint64_t icount; 832 - uint64_t ifree; 833 - uint64_t fdblocks; 834 - xfs_extlen_t lsize; 835 - int64_t ffree; 836 892 837 893 /* 838 894 * Expedite background inodegc but don't wait. We do not want to block ··· 896 840 */ 897 841 xfs_inodegc_push(mp); 898 842 899 - statp->f_type = XFS_SUPER_MAGIC; 900 - statp->f_namelen = MAXNAMELEN - 1; 843 + st->f_type = XFS_SUPER_MAGIC; 844 + st->f_namelen = MAXNAMELEN - 1; 845 + st->f_bsize = mp->m_sb.sb_blocksize; 846 + st->f_fsid = u64_to_fsid(huge_encode_dev(mp->m_ddev_targp->bt_dev)); 901 847 902 - id = huge_encode_dev(mp->m_ddev_targp->bt_dev); 903 - statp->f_fsid = u64_to_fsid(id); 904 - 905 - icount = percpu_counter_sum(&mp->m_icount); 906 - ifree = percpu_counter_sum(&mp->m_ifree); 907 - fdblocks = percpu_counter_sum(&mp->m_fdblocks); 908 - 909 - spin_lock(&mp->m_sb_lock); 910 - statp->f_bsize = sbp->sb_blocksize; 911 - lsize = sbp->sb_logstart ? 
sbp->sb_logblocks : 0; 912 - statp->f_blocks = sbp->sb_dblocks - lsize; 913 - spin_unlock(&mp->m_sb_lock); 914 - 915 - /* make sure statp->f_bfree does not underflow */ 916 - statp->f_bfree = max_t(int64_t, 0, 917 - fdblocks - xfs_fdblocks_unavailable(mp)); 918 - statp->f_bavail = statp->f_bfree; 919 - 920 - fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); 921 - statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); 922 - if (M_IGEO(mp)->maxicount) 923 - statp->f_files = min_t(typeof(statp->f_files), 924 - statp->f_files, 925 - M_IGEO(mp)->maxicount); 926 - 927 - /* If sb_icount overshot maxicount, report actual allocation */ 928 - statp->f_files = max_t(typeof(statp->f_files), 929 - statp->f_files, 930 - sbp->sb_icount); 931 - 932 - /* make sure statp->f_ffree does not underflow */ 933 - ffree = statp->f_files - (icount - ifree); 934 - statp->f_ffree = max_t(int64_t, ffree, 0); 848 + xfs_statfs_data(mp, st); 849 + xfs_statfs_inodes(mp, st); 935 850 936 851 if (XFS_IS_REALTIME_MOUNT(mp) && 937 - (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) { 938 - s64 freertx; 939 - 940 - statp->f_blocks = sbp->sb_rblocks; 941 - freertx = percpu_counter_sum_positive(&mp->m_frextents); 942 - statp->f_bavail = statp->f_bfree = 943 - xfs_rtbxlen_to_blen(mp, freertx); 944 - } 852 + (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) 853 + xfs_statfs_rt(mp, st); 945 854 946 855 if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) && 947 856 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == 948 857 (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) 949 - xfs_qm_statvfs(ip, statp); 858 + xfs_qm_statvfs(ip, st); 950 859 860 + /* 861 + * XFS does not distinguish between blocks available to privileged and 862 + * unprivileged users. 
863 + */ 864 + st->f_bavail = st->f_bfree; 951 865 return 0; 952 866 } 953 867 ··· 1780 1754 xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); 1781 1755 1782 1756 if (xfs_has_reflink(mp)) { 1783 - if (mp->m_sb.sb_rblocks) { 1757 + if (xfs_has_realtime(mp) && 1758 + !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { 1784 1759 xfs_alert(mp, 1785 - "reflink not compatible with realtime device!"); 1760 + "reflink not compatible with realtime extent size %u!", 1761 + mp->m_sb.sb_rextsize); 1786 1762 error = -EINVAL; 1787 1763 goto out_filestream_unmount; 1788 1764 } ··· 1795 1767 } 1796 1768 } 1797 1769 1798 - if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) { 1799 - xfs_alert(mp, 1800 - "reverse mapping btree not compatible with realtime device!"); 1801 - error = -EINVAL; 1802 - goto out_filestream_unmount; 1803 - } 1804 1770 1805 1771 if (xfs_has_exchange_range(mp)) 1806 1772 xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE);

-1

fs/xfs/xfs_super.h

··· 92 92 93 93 extern const struct export_operations xfs_export_operations; 94 94 extern const struct quotactl_ops xfs_quotactl_operations; 95 - extern const struct dax_holder_operations xfs_dax_holder_operations; 96 95 97 96 extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); 98 97

+182 -88

fs/xfs/xfs_trace.h

··· 14 14 * ino: filesystem inode number 15 15 * 16 16 * agbno: per-AG block number in fs blocks 17 + * rgbno: per-rtgroup block number in fs blocks 17 18 * startblock: physical block number for file mappings. This is either a 18 19 * segmented fsblock for data device mappings, or a rfsblock 19 20 * for realtime device mappings 20 21 * fsbcount: number of blocks in an extent, in fs blocks 22 + * 23 + * gbno: generic allocation group block number. This is an agbno for 24 + * space in a per-AG or a rgbno for space in a realtime group. 21 25 * 22 26 * daddr: physical block number in 512b blocks 23 27 * bbcount: number of blocks in a physical extent, in 512b blocks ··· 498 494 __entry->dev = bp->b_target->bt_dev; 499 495 __entry->bno = xfs_buf_daddr(bp); 500 496 __entry->nblks = bp->b_length; 501 - __entry->hold = atomic_read(&bp->b_hold); 497 + __entry->hold = bp->b_hold; 502 498 __entry->pincount = atomic_read(&bp->b_pin_count); 503 499 __entry->lockval = bp->b_sema.count; 504 500 __entry->flags = bp->b_flags; ··· 569 565 __entry->bno = xfs_buf_daddr(bp); 570 566 __entry->length = bp->b_length; 571 567 __entry->flags = flags; 572 - __entry->hold = atomic_read(&bp->b_hold); 568 + __entry->hold = bp->b_hold; 573 569 __entry->pincount = atomic_read(&bp->b_pin_count); 574 570 __entry->lockval = bp->b_sema.count; 575 571 __entry->caller_ip = caller_ip; ··· 612 608 __entry->dev = bp->b_target->bt_dev; 613 609 __entry->bno = xfs_buf_daddr(bp); 614 610 __entry->length = bp->b_length; 615 - __entry->hold = atomic_read(&bp->b_hold); 611 + __entry->hold = bp->b_hold; 616 612 __entry->pincount = atomic_read(&bp->b_pin_count); 617 613 __entry->lockval = bp->b_sema.count; 618 614 __entry->error = error; ··· 656 652 __entry->buf_bno = xfs_buf_daddr(bip->bli_buf); 657 653 __entry->buf_len = bip->bli_buf->b_length; 658 654 __entry->buf_flags = bip->bli_buf->b_flags; 659 - __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); 655 + __entry->buf_hold = bip->bli_buf->b_hold; 660 656 
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); 661 657 __entry->buf_lockval = bip->bli_buf->b_sema.count; 662 658 __entry->li_flags = bip->bli_item.li_flags; ··· 2299 2295 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS); 2300 2296 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE); 2301 2297 TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID); 2298 + TRACE_DEFINE_ENUM(XFS_DINODE_FMT_META_BTREE); 2302 2299 2303 2300 DECLARE_EVENT_CLASS(xfs_swap_extent_class, 2304 2301 TP_PROTO(struct xfs_inode *ip, int which), ··· 2923 2918 /* rmap tracepoints */ 2924 2919 DECLARE_EVENT_CLASS(xfs_rmap_class, 2925 2920 TP_PROTO(struct xfs_btree_cur *cur, 2926 - xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, 2921 + xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, 2927 2922 const struct xfs_owner_info *oinfo), 2928 - TP_ARGS(cur, agbno, len, unwritten, oinfo), 2923 + TP_ARGS(cur, gbno, len, unwritten, oinfo), 2929 2924 TP_STRUCT__entry( 2930 2925 __field(dev_t, dev) 2926 + __field(enum xfs_group_type, type) 2931 2927 __field(xfs_agnumber_t, agno) 2932 - __field(xfs_agblock_t, agbno) 2928 + __field(xfs_agblock_t, gbno) 2933 2929 __field(xfs_extlen_t, len) 2934 2930 __field(uint64_t, owner) 2935 2931 __field(uint64_t, offset) ··· 2938 2932 ), 2939 2933 TP_fast_assign( 2940 2934 __entry->dev = cur->bc_mp->m_super->s_dev; 2935 + __entry->type = cur->bc_group->xg_type; 2941 2936 __entry->agno = cur->bc_group->xg_gno; 2942 - __entry->agbno = agbno; 2937 + __entry->gbno = gbno; 2943 2938 __entry->len = len; 2944 2939 __entry->owner = oinfo->oi_owner; 2945 2940 __entry->offset = oinfo->oi_offset; ··· 2948 2941 if (unwritten) 2949 2942 __entry->flags |= XFS_RMAP_UNWRITTEN; 2950 2943 ), 2951 - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", 2944 + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", 2952 2945 MAJOR(__entry->dev), MINOR(__entry->dev), 2946 + __print_symbolic(__entry->type, 
XG_TYPE_STRINGS), 2953 2947 __entry->agno, 2954 - __entry->agbno, 2948 + __entry->gbno, 2955 2949 __entry->len, 2956 2950 __entry->owner, 2957 2951 __entry->offset, ··· 2961 2953 #define DEFINE_RMAP_EVENT(name) \ 2962 2954 DEFINE_EVENT(xfs_rmap_class, name, \ 2963 2955 TP_PROTO(struct xfs_btree_cur *cur, \ 2964 - xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \ 2956 + xfs_agblock_t gbno, xfs_extlen_t len, bool unwritten, \ 2965 2957 const struct xfs_owner_info *oinfo), \ 2966 - TP_ARGS(cur, agbno, len, unwritten, oinfo)) 2958 + TP_ARGS(cur, gbno, len, unwritten, oinfo)) 2967 2959 2968 2960 /* btree cursor error/%ip tracepoint class */ 2969 2961 DECLARE_EVENT_CLASS(xfs_btree_error_class, ··· 3026 3018 TP_ARGS(cur, state, caller_ip), 3027 3019 TP_STRUCT__entry( 3028 3020 __field(dev_t, dev) 3021 + __field(enum xfs_group_type, type) 3029 3022 __field(xfs_agnumber_t, agno) 3030 - __field(xfs_ino_t, ino) 3031 3023 __field(int, state) 3032 3024 __field(unsigned long, caller_ip) 3033 3025 ), 3034 3026 TP_fast_assign( 3035 3027 __entry->dev = cur->bc_mp->m_super->s_dev; 3036 - switch (cur->bc_ops->type) { 3037 - case XFS_BTREE_TYPE_INODE: 3038 - __entry->agno = 0; 3039 - __entry->ino = cur->bc_ino.ip->i_ino; 3040 - break; 3041 - case XFS_BTREE_TYPE_AG: 3042 - __entry->agno = cur->bc_group->xg_gno; 3043 - __entry->ino = 0; 3044 - break; 3045 - case XFS_BTREE_TYPE_MEM: 3046 - __entry->agno = 0; 3047 - __entry->ino = 0; 3048 - break; 3049 - } 3028 + __entry->type = cur->bc_group->xg_type; 3029 + __entry->agno = cur->bc_group->xg_gno; 3050 3030 __entry->state = state; 3051 3031 __entry->caller_ip = caller_ip; 3052 3032 ), 3053 - TP_printk("dev %d:%d agno 0x%x ino 0x%llx state %d caller %pS", 3033 + TP_printk("dev %d:%d %sno 0x%x state %d caller %pS", 3054 3034 MAJOR(__entry->dev), MINOR(__entry->dev), 3035 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3055 3036 __entry->agno, 3056 - __entry->ino, 3057 3037 __entry->state, 3058 3038 (char *)__entry->caller_ip) 
3059 3039 ); 3060 3040 3061 3041 DECLARE_EVENT_CLASS(xfs_rmapbt_class, 3062 3042 TP_PROTO(struct xfs_btree_cur *cur, 3063 - xfs_agblock_t agbno, xfs_extlen_t len, 3043 + xfs_agblock_t gbno, xfs_extlen_t len, 3064 3044 uint64_t owner, uint64_t offset, unsigned int flags), 3065 - TP_ARGS(cur, agbno, len, owner, offset, flags), 3045 + TP_ARGS(cur, gbno, len, owner, offset, flags), 3066 3046 TP_STRUCT__entry( 3067 3047 __field(dev_t, dev) 3048 + __field(enum xfs_group_type, type) 3068 3049 __field(xfs_agnumber_t, agno) 3069 - __field(xfs_agblock_t, agbno) 3050 + __field(xfs_agblock_t, gbno) 3070 3051 __field(xfs_extlen_t, len) 3071 3052 __field(uint64_t, owner) 3072 3053 __field(uint64_t, offset) ··· 3063 3066 ), 3064 3067 TP_fast_assign( 3065 3068 __entry->dev = cur->bc_mp->m_super->s_dev; 3069 + __entry->type = cur->bc_group->xg_type; 3066 3070 __entry->agno = cur->bc_group->xg_gno; 3067 - __entry->agbno = agbno; 3071 + __entry->gbno = gbno; 3068 3072 __entry->len = len; 3069 3073 __entry->owner = owner; 3070 3074 __entry->offset = offset; 3071 3075 __entry->flags = flags; 3072 3076 ), 3073 - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", 3077 + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", 3074 3078 MAJOR(__entry->dev), MINOR(__entry->dev), 3079 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3075 3080 __entry->agno, 3076 - __entry->agbno, 3081 + __entry->gbno, 3077 3082 __entry->len, 3078 3083 __entry->owner, 3079 3084 __entry->offset, ··· 3084 3085 #define DEFINE_RMAPBT_EVENT(name) \ 3085 3086 DEFINE_EVENT(xfs_rmapbt_class, name, \ 3086 3087 TP_PROTO(struct xfs_btree_cur *cur, \ 3087 - xfs_agblock_t agbno, xfs_extlen_t len, \ 3088 + xfs_agblock_t gbno, xfs_extlen_t len, \ 3088 3089 uint64_t owner, uint64_t offset, unsigned int flags), \ 3089 - TP_ARGS(cur, agbno, len, owner, offset, flags)) 3090 + TP_ARGS(cur, gbno, len, owner, offset, flags)) 3090 3091 3091 
3092 TRACE_DEFINE_ENUM(XFS_RMAP_MAP); 3092 3093 TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); ··· 3103 3104 TP_STRUCT__entry( 3104 3105 __field(dev_t, dev) 3105 3106 __field(unsigned long long, owner) 3107 + __field(enum xfs_group_type, type) 3106 3108 __field(xfs_agnumber_t, agno) 3107 - __field(xfs_agblock_t, agbno) 3109 + __field(xfs_agblock_t, gbno) 3108 3110 __field(int, whichfork) 3109 3111 __field(xfs_fileoff_t, l_loff) 3110 3112 __field(xfs_filblks_t, l_len) ··· 3114 3114 ), 3115 3115 TP_fast_assign( 3116 3116 __entry->dev = mp->m_super->s_dev; 3117 - __entry->agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); 3118 - __entry->agbno = XFS_FSB_TO_AGBNO(mp, 3119 - ri->ri_bmap.br_startblock); 3117 + __entry->type = ri->ri_group->xg_type; 3118 + __entry->agno = ri->ri_group->xg_gno; 3119 + __entry->gbno = xfs_fsb_to_gbno(mp, 3120 + ri->ri_bmap.br_startblock, 3121 + ri->ri_group->xg_type); 3120 3122 __entry->owner = ri->ri_owner; 3121 3123 __entry->whichfork = ri->ri_whichfork; 3122 3124 __entry->l_loff = ri->ri_bmap.br_startoff; ··· 3126 3124 __entry->l_state = ri->ri_bmap.br_state; 3127 3125 __entry->op = ri->ri_type; 3128 3126 ), 3129 - TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", 3127 + TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", 3130 3128 MAJOR(__entry->dev), MINOR(__entry->dev), 3131 3129 __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS), 3130 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3132 3131 __entry->agno, 3133 - __entry->agbno, 3132 + __entry->gbno, 3134 3133 __entry->owner, 3135 3134 __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), 3136 3135 __entry->l_loff, ··· 3305 3302 /* refcount tracepoint classes */ 3306 3303 3307 3304 DECLARE_EVENT_CLASS(xfs_refcount_class, 3308 - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, 3305 + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, 3309 3306 
xfs_extlen_t len), 3310 - TP_ARGS(cur, agbno, len), 3307 + TP_ARGS(cur, gbno, len), 3311 3308 TP_STRUCT__entry( 3312 3309 __field(dev_t, dev) 3310 + __field(enum xfs_group_type, type) 3313 3311 __field(xfs_agnumber_t, agno) 3314 - __field(xfs_agblock_t, agbno) 3312 + __field(xfs_agblock_t, gbno) 3315 3313 __field(xfs_extlen_t, len) 3316 3314 ), 3317 3315 TP_fast_assign( 3318 3316 __entry->dev = cur->bc_mp->m_super->s_dev; 3317 + __entry->type = cur->bc_group->xg_type; 3319 3318 __entry->agno = cur->bc_group->xg_gno; 3320 - __entry->agbno = agbno; 3319 + __entry->gbno = gbno; 3321 3320 __entry->len = len; 3322 3321 ), 3323 - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", 3322 + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x", 3324 3323 MAJOR(__entry->dev), MINOR(__entry->dev), 3324 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3325 3325 __entry->agno, 3326 - __entry->agbno, 3326 + __entry->gbno, 3327 3327 __entry->len) 3328 3328 ); 3329 3329 #define DEFINE_REFCOUNT_EVENT(name) \ 3330 3330 DEFINE_EVENT(xfs_refcount_class, name, \ 3331 - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \ 3331 + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, \ 3332 3332 xfs_extlen_t len), \ 3333 - TP_ARGS(cur, agbno, len)) 3333 + TP_ARGS(cur, gbno, len)) 3334 3334 3335 3335 TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi); 3336 3336 TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi); 3337 3337 TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi); 3338 3338 TRACE_EVENT(xfs_refcount_lookup, 3339 - TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, 3339 + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t gbno, 3340 3340 xfs_lookup_t dir), 3341 - TP_ARGS(cur, agbno, dir), 3341 + TP_ARGS(cur, gbno, dir), 3342 3342 TP_STRUCT__entry( 3343 3343 __field(dev_t, dev) 3344 + __field(enum xfs_group_type, type) 3344 3345 __field(xfs_agnumber_t, agno) 3345 - __field(xfs_agblock_t, agbno) 3346 + __field(xfs_agblock_t, gbno) 3346 3347 __field(xfs_lookup_t, dir) 3347 3348 ), 3348 3349 TP_fast_assign( 3349 
3350 __entry->dev = cur->bc_mp->m_super->s_dev; 3351 + __entry->type = cur->bc_group->xg_type; 3350 3352 __entry->agno = cur->bc_group->xg_gno; 3351 - __entry->agbno = agbno; 3353 + __entry->gbno = gbno; 3352 3354 __entry->dir = dir; 3353 3355 ), 3354 - TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)", 3356 + TP_printk("dev %d:%d %sno 0x%x gbno 0x%x cmp %s(%d)", 3355 3357 MAJOR(__entry->dev), MINOR(__entry->dev), 3358 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3356 3359 __entry->agno, 3357 - __entry->agbno, 3360 + __entry->gbno, 3358 3361 __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), 3359 3362 __entry->dir) 3360 3363 ) ··· 3371 3362 TP_ARGS(cur, irec), 3372 3363 TP_STRUCT__entry( 3373 3364 __field(dev_t, dev) 3365 + __field(enum xfs_group_type, type) 3374 3366 __field(xfs_agnumber_t, agno) 3375 3367 __field(enum xfs_refc_domain, domain) 3376 3368 __field(xfs_agblock_t, startblock) ··· 3380 3370 ), 3381 3371 TP_fast_assign( 3382 3372 __entry->dev = cur->bc_mp->m_super->s_dev; 3373 + __entry->type = cur->bc_group->xg_type; 3383 3374 __entry->agno = cur->bc_group->xg_gno; 3384 3375 __entry->domain = irec->rc_domain; 3385 3376 __entry->startblock = irec->rc_startblock; 3386 3377 __entry->blockcount = irec->rc_blockcount; 3387 3378 __entry->refcount = irec->rc_refcount; 3388 3379 ), 3389 - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", 3380 + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u", 3390 3381 MAJOR(__entry->dev), MINOR(__entry->dev), 3382 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3391 3383 __entry->agno, 3392 3384 __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), 3393 3385 __entry->startblock, ··· 3405 3393 /* single-rcext and an agbno tracepoint class */ 3406 3394 DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, 3407 3395 TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, 3408 - xfs_agblock_t agbno), 3409 - TP_ARGS(cur, irec, agbno), 3396 
+ xfs_agblock_t gbno), 3397 + TP_ARGS(cur, irec, gbno), 3410 3398 TP_STRUCT__entry( 3411 3399 __field(dev_t, dev) 3400 + __field(enum xfs_group_type, type) 3412 3401 __field(xfs_agnumber_t, agno) 3413 3402 __field(enum xfs_refc_domain, domain) 3414 3403 __field(xfs_agblock_t, startblock) 3415 3404 __field(xfs_extlen_t, blockcount) 3416 3405 __field(xfs_nlink_t, refcount) 3417 - __field(xfs_agblock_t, agbno) 3406 + __field(xfs_agblock_t, gbno) 3418 3407 ), 3419 3408 TP_fast_assign( 3420 3409 __entry->dev = cur->bc_mp->m_super->s_dev; 3410 + __entry->type = cur->bc_group->xg_type; 3421 3411 __entry->agno = cur->bc_group->xg_gno; 3422 3412 __entry->domain = irec->rc_domain; 3423 3413 __entry->startblock = irec->rc_startblock; 3424 3414 __entry->blockcount = irec->rc_blockcount; 3425 3415 __entry->refcount = irec->rc_refcount; 3426 - __entry->agbno = agbno; 3416 + __entry->gbno = gbno; 3427 3417 ), 3428 - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", 3418 + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", 3429 3419 MAJOR(__entry->dev), MINOR(__entry->dev), 3420 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3430 3421 __entry->agno, 3431 3422 __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), 3432 3423 __entry->startblock, 3433 3424 __entry->blockcount, 3434 3425 __entry->refcount, 3435 - __entry->agbno) 3426 + __entry->gbno) 3436 3427 ) 3437 3428 3438 3429 #define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ 3439 3430 DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ 3440 3431 TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \ 3441 - xfs_agblock_t agbno), \ 3442 - TP_ARGS(cur, irec, agbno)) 3432 + xfs_agblock_t gbno), \ 3433 + TP_ARGS(cur, irec, gbno)) 3443 3434 3444 3435 /* double-rcext tracepoint class */ 3445 3436 DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, 3446 3437 TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, 3447 - struct 
xfs_refcount_irec *i2), 3438 + struct xfs_refcount_irec *i2), 3448 3439 TP_ARGS(cur, i1, i2), 3449 3440 TP_STRUCT__entry( 3450 3441 __field(dev_t, dev) 3442 + __field(enum xfs_group_type, type) 3451 3443 __field(xfs_agnumber_t, agno) 3452 3444 __field(enum xfs_refc_domain, i1_domain) 3453 3445 __field(xfs_agblock_t, i1_startblock) ··· 3464 3448 ), 3465 3449 TP_fast_assign( 3466 3450 __entry->dev = cur->bc_mp->m_super->s_dev; 3451 + __entry->type = cur->bc_group->xg_type; 3467 3452 __entry->agno = cur->bc_group->xg_gno; 3468 3453 __entry->i1_domain = i1->rc_domain; 3469 3454 __entry->i1_startblock = i1->rc_startblock; ··· 3475 3458 __entry->i2_blockcount = i2->rc_blockcount; 3476 3459 __entry->i2_refcount = i2->rc_refcount; 3477 3460 ), 3478 - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " 3479 - "dom %s agbno 0x%x fsbcount 0x%x refcount %u", 3461 + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " 3462 + "dom %s gbno 0x%x fsbcount 0x%x refcount %u", 3480 3463 MAJOR(__entry->dev), MINOR(__entry->dev), 3464 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3481 3465 __entry->agno, 3482 3466 __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), 3483 3467 __entry->i1_startblock, ··· 3499 3481 /* double-rcext and an agbno tracepoint class */ 3500 3482 DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, 3501 3483 TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, 3502 - struct xfs_refcount_irec *i2, xfs_agblock_t agbno), 3503 - TP_ARGS(cur, i1, i2, agbno), 3484 + struct xfs_refcount_irec *i2, xfs_agblock_t gbno), 3485 + TP_ARGS(cur, i1, i2, gbno), 3504 3486 TP_STRUCT__entry( 3505 3487 __field(dev_t, dev) 3488 + __field(enum xfs_group_type, type) 3506 3489 __field(xfs_agnumber_t, agno) 3507 3490 __field(enum xfs_refc_domain, i1_domain) 3508 3491 __field(xfs_agblock_t, i1_startblock) ··· 3513 3494 __field(xfs_agblock_t, i2_startblock) 3514 3495 __field(xfs_extlen_t, i2_blockcount) 
3515 3496 __field(xfs_nlink_t, i2_refcount) 3516 - __field(xfs_agblock_t, agbno) 3497 + __field(xfs_agblock_t, gbno) 3517 3498 ), 3518 3499 TP_fast_assign( 3519 3500 __entry->dev = cur->bc_mp->m_super->s_dev; 3501 + __entry->type = cur->bc_group->xg_type; 3520 3502 __entry->agno = cur->bc_group->xg_gno; 3521 3503 __entry->i1_domain = i1->rc_domain; 3522 3504 __entry->i1_startblock = i1->rc_startblock; ··· 3527 3507 __entry->i2_startblock = i2->rc_startblock; 3528 3508 __entry->i2_blockcount = i2->rc_blockcount; 3529 3509 __entry->i2_refcount = i2->rc_refcount; 3530 - __entry->agbno = agbno; 3510 + __entry->gbno = gbno; 3531 3511 ), 3532 - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " 3533 - "dom %s agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", 3512 + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " 3513 + "dom %s gbno 0x%x fsbcount 0x%x refcount %u @ gbno 0x%x", 3534 3514 MAJOR(__entry->dev), MINOR(__entry->dev), 3515 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3535 3516 __entry->agno, 3536 3517 __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), 3537 3518 __entry->i1_startblock, ··· 3542 3521 __entry->i2_startblock, 3543 3522 __entry->i2_blockcount, 3544 3523 __entry->i2_refcount, 3545 - __entry->agbno) 3524 + __entry->gbno) 3546 3525 ) 3547 3526 3548 3527 #define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ 3549 3528 DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ 3550 3529 TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \ 3551 - struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \ 3552 - TP_ARGS(cur, i1, i2, agbno)) 3530 + struct xfs_refcount_irec *i2, xfs_agblock_t gbno), \ 3531 + TP_ARGS(cur, i1, i2, gbno)) 3553 3532 3554 3533 /* triple-rcext tracepoint class */ 3555 3534 DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, ··· 3558 3537 TP_ARGS(cur, i1, i2, i3), 3559 3538 TP_STRUCT__entry( 3560 3539 __field(dev_t, dev) 3540 + __field(enum 
xfs_group_type, type) 3561 3541 __field(xfs_agnumber_t, agno) 3562 3542 __field(enum xfs_refc_domain, i1_domain) 3563 3543 __field(xfs_agblock_t, i1_startblock) ··· 3575 3553 ), 3576 3554 TP_fast_assign( 3577 3555 __entry->dev = cur->bc_mp->m_super->s_dev; 3556 + __entry->type = cur->bc_group->xg_type; 3578 3557 __entry->agno = cur->bc_group->xg_gno; 3579 3558 __entry->i1_domain = i1->rc_domain; 3580 3559 __entry->i1_startblock = i1->rc_startblock; ··· 3590 3567 __entry->i3_blockcount = i3->rc_blockcount; 3591 3568 __entry->i3_refcount = i3->rc_refcount; 3592 3569 ), 3593 - TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " 3594 - "dom %s agbno 0x%x fsbcount 0x%x refcount %u -- " 3595 - "dom %s agbno 0x%x fsbcount 0x%x refcount %u", 3570 + TP_printk("dev %d:%d %sno 0x%x dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " 3571 + "dom %s gbno 0x%x fsbcount 0x%x refcount %u -- " 3572 + "dom %s gbno 0x%x fsbcount 0x%x refcount %u", 3596 3573 MAJOR(__entry->dev), MINOR(__entry->dev), 3574 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3597 3575 __entry->agno, 3598 3576 __print_symbolic(__entry->i1_domain, XFS_REFC_DOMAIN_STRINGS), 3599 3577 __entry->i1_startblock, ··· 3662 3638 TP_ARGS(mp, refc), 3663 3639 TP_STRUCT__entry( 3664 3640 __field(dev_t, dev) 3641 + __field(enum xfs_group_type, type) 3665 3642 __field(xfs_agnumber_t, agno) 3666 3643 __field(int, op) 3667 - __field(xfs_agblock_t, agbno) 3644 + __field(xfs_agblock_t, gbno) 3668 3645 __field(xfs_extlen_t, len) 3669 3646 ), 3670 3647 TP_fast_assign( 3671 3648 __entry->dev = mp->m_super->s_dev; 3672 - __entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock); 3649 + __entry->type = refc->ri_group->xg_type; 3650 + __entry->agno = refc->ri_group->xg_gno; 3673 3651 __entry->op = refc->ri_type; 3674 - __entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock); 3652 + __entry->gbno = xfs_fsb_to_gbno(mp, refc->ri_startblock, 3653 + refc->ri_group->xg_type); 3675 3654 __entry->len = 
refc->ri_blockcount; 3676 3655 ), 3677 - TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x", 3656 + TP_printk("dev %d:%d op %s %sno 0x%x gbno 0x%x fsbcount 0x%x", 3678 3657 MAJOR(__entry->dev), MINOR(__entry->dev), 3679 3658 __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS), 3659 + __print_symbolic(__entry->type, XG_TYPE_STRINGS), 3680 3660 __entry->agno, 3681 - __entry->agbno, 3661 + __entry->gbno, 3682 3662 __entry->len) 3683 3663 ); 3684 3664 #define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \ ··· 4021 3993 __entry->offset = frec->offset; 4022 3994 __entry->flags = frec->rm_flags; 4023 3995 ), 4024 - TP_printk("dev %d:%d keydev %d:%d agno 0x%x rmapbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", 3996 + TP_printk("dev %d:%d keydev %d:%d agno 0x%x gbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", 4025 3997 MAJOR(__entry->dev), MINOR(__entry->dev), 4026 3998 MAJOR(__entry->keydev), MINOR(__entry->keydev), 4027 3999 __entry->agno, ··· 4978 4950 __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; 4979 4951 __entry->bno = xfs_buf_daddr(bp); 4980 4952 __entry->nblks = bp->b_length; 4981 - __entry->hold = atomic_read(&bp->b_hold); 4953 + __entry->hold = bp->b_hold; 4982 4954 __entry->pincount = atomic_read(&bp->b_pin_count); 4983 4955 __entry->lockval = bp->b_sema.count; 4984 4956 __entry->flags = bp->b_flags; ··· 5601 5573 xfs_ino_t ino), \ 5602 5574 TP_ARGS(dp, name, ino)) 5603 5575 DEFINE_METADIR_EVENT(xfs_metadir_lookup); 5576 + 5577 + /* metadata inode space reservations */ 5578 + 5579 + DECLARE_EVENT_CLASS(xfs_metafile_resv_class, 5580 + TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), 5581 + TP_ARGS(ip, len), 5582 + TP_STRUCT__entry( 5583 + __field(dev_t, dev) 5584 + __field(xfs_ino_t, ino) 5585 + __field(unsigned long long, freeblks) 5586 + __field(unsigned long long, reserved) 5587 + __field(unsigned long long, asked) 5588 + __field(unsigned long long, used) 5589 
+ __field(unsigned long long, len) 5590 + ), 5591 + TP_fast_assign( 5592 + struct xfs_mount *mp = ip->i_mount; 5593 + 5594 + __entry->dev = mp->m_super->s_dev; 5595 + __entry->ino = ip->i_ino; 5596 + __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); 5597 + __entry->reserved = ip->i_delayed_blks; 5598 + __entry->asked = ip->i_meta_resv_asked; 5599 + __entry->used = ip->i_nblocks; 5600 + __entry->len = len; 5601 + ), 5602 + TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", 5603 + MAJOR(__entry->dev), MINOR(__entry->dev), 5604 + __entry->ino, 5605 + __entry->freeblks, 5606 + __entry->reserved, 5607 + __entry->asked, 5608 + __entry->used, 5609 + __entry->len) 5610 + ) 5611 + #define DEFINE_METAFILE_RESV_EVENT(name) \ 5612 + DEFINE_EVENT(xfs_metafile_resv_class, name, \ 5613 + TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ 5614 + TP_ARGS(ip, len)) 5615 + DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); 5616 + DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); 5617 + DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); 5618 + DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); 5619 + DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); 5620 + DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); 5621 + 5622 + #ifdef CONFIG_XFS_RT 5623 + TRACE_EVENT(xfs_growfs_check_rtgeom, 5624 + TP_PROTO(const struct xfs_mount *mp, unsigned int min_logfsbs), 5625 + TP_ARGS(mp, min_logfsbs), 5626 + TP_STRUCT__entry( 5627 + __field(dev_t, dev) 5628 + __field(unsigned int, logblocks) 5629 + __field(unsigned int, min_logfsbs) 5630 + ), 5631 + TP_fast_assign( 5632 + __entry->dev = mp->m_super->s_dev; 5633 + __entry->logblocks = mp->m_sb.sb_logblocks; 5634 + __entry->min_logfsbs = min_logfsbs; 5635 + ), 5636 + TP_printk("dev %d:%d logblocks %u min_logfsbs %u", 5637 + MAJOR(__entry->dev), MINOR(__entry->dev), 5638 + __entry->logblocks, 5639 + __entry->min_logfsbs) 5640 + ); 5641 + #endif /* CONFIG_XFS_RT */ 5604 5642 5605 
5643 #endif /* _TRACE_XFS_H */ 5606 5644

+4 -2

fs/xfs/xfs_trans.c

··· 100 100 /* 101 101 * Initialize the new transaction structure. 102 102 */ 103 - ntp->t_magic = XFS_TRANS_HEADER_MAGIC; 104 103 ntp->t_mountp = tp->t_mountp; 105 104 INIT_LIST_HEAD(&ntp->t_items); 106 105 INIT_LIST_HEAD(&ntp->t_busy); ··· 274 275 ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || 275 276 xfs_has_lazysbcount(mp)); 276 277 277 - tp->t_magic = XFS_TRANS_HEADER_MAGIC; 278 278 tp->t_flags = flags; 279 279 tp->t_mountp = mp; 280 280 INIT_LIST_HEAD(&tp->t_items); ··· 1264 1266 xfs_ilock(ip, XFS_ILOCK_EXCL); 1265 1267 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 1266 1268 1269 + if (xfs_is_metadir_inode(ip)) 1270 + goto out; 1271 + 1267 1272 error = xfs_qm_dqattach_locked(ip, false); 1268 1273 if (error) { 1269 1274 /* Caller should have allocated the dquots! */ ··· 1335 1334 goto out_cancel; 1336 1335 } 1337 1336 1337 + out: 1338 1338 *tpp = tp; 1339 1339 return 0; 1340 1340

-1

fs/xfs/xfs_trans.h

··· 122 122 * This is the structure maintained for every active transaction. 123 123 */ 124 124 typedef struct xfs_trans { 125 - unsigned int t_magic; /* magic number */ 126 125 unsigned int t_log_res; /* amt of log space resvd */ 127 126 unsigned int t_log_count; /* count for perm log res */ 128 127 unsigned int t_blk_res; /* # of blocks resvd */

+2 -7

fs/xfs/xfs_trans_ail.c

··· 359 359 } 360 360 361 361 /* protected by ail_lock */ 362 - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { 363 - if (bp->b_flags & (_XBF_INODES | _XBF_DQUOTS)) 364 - clear_bit(XFS_LI_FAILED, &lip->li_flags); 365 - else 366 - xfs_clear_li_failed(lip); 367 - } 368 - 362 + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) 363 + clear_bit(XFS_LI_FAILED, &lip->li_flags); 369 364 xfs_buf_unlock(bp); 370 365 return XFS_ITEM_SUCCESS; 371 366 }

+4 -4

fs/xfs/xfs_trans_buf.c

··· 659 659 ASSERT(atomic_read(&bip->bli_refcount) > 0); 660 660 661 661 bip->bli_flags |= XFS_BLI_INODE_BUF; 662 - bp->b_flags |= _XBF_INODES; 662 + bp->b_iodone = xfs_buf_inode_iodone; 663 663 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 664 664 } 665 665 ··· 684 684 ASSERT(atomic_read(&bip->bli_refcount) > 0); 685 685 686 686 bip->bli_flags |= XFS_BLI_STALE_INODE; 687 - bp->b_flags |= _XBF_INODES; 687 + bp->b_iodone = xfs_buf_inode_iodone; 688 688 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 689 689 } 690 690 ··· 709 709 ASSERT(atomic_read(&bip->bli_refcount) > 0); 710 710 711 711 bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; 712 - bp->b_flags |= _XBF_INODES; 712 + bp->b_iodone = xfs_buf_inode_iodone; 713 713 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); 714 714 } 715 715 ··· 820 820 break; 821 821 } 822 822 823 - bp->b_flags |= _XBF_DQUOTS; 823 + bp->b_iodone = xfs_buf_dquot_iodone; 824 824 xfs_trans_buf_set_type(tp, bp, type); 825 825 }

+4 -4

fs/xfs/xfs_trans_dquot.c

··· 156 156 unsigned int field, 157 157 int64_t delta) 158 158 { 159 - ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip)); 159 + if (xfs_is_metadir_inode(ip)) 160 + return; 160 161 161 162 xfs_trans_mod_dquot(tp, dqp, field, delta); 162 163 ··· 247 246 xfs_mount_t *mp = tp->t_mountp; 248 247 249 248 if (!XFS_IS_QUOTA_ON(mp) || 250 - xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) 249 + xfs_is_quota_inode(&mp->m_sb, ip->i_ino) || 250 + xfs_is_metadir_inode(ip)) 251 251 return; 252 - 253 - ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip)); 254 252 255 253 if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) 256 254 xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);

Configure Feed

Configure Feed