Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

ocfs2: give ocfs2 the ability to reclaim suballocator free bg

Patch series "ocfs2: give ocfs2 the ability to reclaim suballocator free
bg", v6.


This patch (of 2):

The current ocfs2 code can't reclaim suballocator block group space. In
some cases, this causes ocfs2 to hold onto a lot of space. For example,
when creating lots of small files, the space is held/managed by the
'//inode_alloc'. After the user deletes all the small files, the space
never returns to the '//global_bitmap'. As a result, on a small ocfs2
volume, ocfs2 cannot provide the needed space even though enough free
space exists, because that space stays stranded in the suballocator.

This patch gives ocfs2 the ability to reclaim suballocator free space when
the block group is freed. For performance reasons, this patch keeps the
first suballocator block group active.

Link: https://lkml.kernel.org/r/20251212074505.25962-2-heming.zhao@suse.com
Signed-off-by: Heming Zhao <heming.zhao@suse.com>
Reviewed-by: Su Yue <glass.su@suse.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Mark Fasheh <mark@fasheh.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Heming Zhao and committed by
Andrew Morton
4a543316 b11052be

+299 -9
+299 -9
fs/ocfs2/suballoc.c
··· 295 295 return ocfs2_validate_gd_self(sb, bh, 0); 296 296 } 297 297 298 + /* 299 + * The hint group descriptor (gd) may already have been released 300 + * in _ocfs2_free_suballoc_bits(). We first check the gd signature, 301 + * then perform the standard ocfs2_read_group_descriptor() jobs. 302 + * 303 + * If the gd signature is invalid, we return 'rc=0' and set 304 + * '*released=1'. The caller is expected to handle this specific case. 305 + * Otherwise, we return the actual error code. 306 + * 307 + * We treat gd signature corruption case as a release case. The 308 + * caller ocfs2_claim_suballoc_bits() will use ocfs2_search_chain() 309 + * to search each gd block. The code will eventually find this 310 + * corrupted gd block - Late, but not missed. 311 + * 312 + * Note: 313 + * The caller is responsible for initializing the '*released' status. 314 + */ 315 + static int ocfs2_read_hint_group_descriptor(struct inode *inode, 316 + struct ocfs2_dinode *di, u64 gd_blkno, 317 + struct buffer_head **bh, int *released) 318 + { 319 + int rc; 320 + struct buffer_head *tmp = *bh; 321 + struct ocfs2_group_desc *gd; 322 + 323 + rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, NULL); 324 + if (rc) 325 + goto out; 326 + 327 + gd = (struct ocfs2_group_desc *) tmp->b_data; 328 + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 329 + /* 330 + * Invalid gd cache was set in ocfs2_read_block(), 331 + * which will affect block_group allocation. 
332 + * Path: 333 + * ocfs2_reserve_suballoc_bits 334 + * ocfs2_block_group_alloc 335 + * ocfs2_block_group_alloc_contig 336 + * ocfs2_set_new_buffer_uptodate 337 + */ 338 + ocfs2_remove_from_cache(INODE_CACHE(inode), tmp); 339 + *released = 1; /* we return 'rc=0' for this case */ 340 + goto free_bh; 341 + } 342 + 343 + /* below jobs same with ocfs2_read_group_descriptor() */ 344 + if (!buffer_jbd(tmp)) { 345 + rc = ocfs2_validate_group_descriptor(inode->i_sb, tmp); 346 + if (rc) 347 + goto free_bh; 348 + } 349 + 350 + rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); 351 + if (rc) 352 + goto free_bh; 353 + 354 + /* If ocfs2_read_block() got us a new bh, pass it up. */ 355 + if (!*bh) 356 + *bh = tmp; 357 + 358 + return rc; 359 + 360 + free_bh: 361 + brelse(tmp); 362 + out: 363 + return rc; 364 + } 365 + 298 366 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, 299 367 u64 gd_blkno, struct buffer_head **bh) 300 368 { ··· 1793 1725 u32 bits_wanted, 1794 1726 u32 min_bits, 1795 1727 struct ocfs2_suballoc_result *res, 1796 - u16 *bits_left) 1728 + u16 *bits_left, int *released) 1797 1729 { 1798 1730 int ret; 1799 1731 struct buffer_head *group_bh = NULL; ··· 1801 1733 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1802 1734 struct inode *alloc_inode = ac->ac_inode; 1803 1735 1804 - ret = ocfs2_read_group_descriptor(alloc_inode, di, 1805 - res->sr_bg_blkno, &group_bh); 1806 - if (ret < 0) { 1736 + ret = ocfs2_read_hint_group_descriptor(alloc_inode, di, 1737 + res->sr_bg_blkno, &group_bh, released); 1738 + if (*released) { 1739 + return 0; 1740 + } else if (ret < 0) { 1807 1741 mlog_errno(ret); 1808 1742 return ret; 1809 1743 } ··· 2020 1950 struct ocfs2_suballoc_result *res) 2021 1951 { 2022 1952 int status; 1953 + int released = 0; 2023 1954 u16 victim, i; 2024 1955 u16 bits_left = 0; 2025 1956 u64 hint = ac->ac_last_group; ··· 2047 1976 goto bail; 2048 1977 } 2049 1978 1979 + /* the hint bg may already be 
released, we quiet search this group. */ 2050 1980 res->sr_bg_blkno = hint; 2051 1981 if (res->sr_bg_blkno) { 2052 1982 /* Attempt to short-circuit the usual search mechanism ··· 2055 1983 * allocation group. This helps us maintain some 2056 1984 * contiguousness across allocations. */ 2057 1985 status = ocfs2_search_one_group(ac, handle, bits_wanted, 2058 - min_bits, res, &bits_left); 1986 + min_bits, res, &bits_left, 1987 + &released); 1988 + if (released) { 1989 + res->sr_bg_blkno = 0; 1990 + goto chain_search; 1991 + } 2059 1992 if (!status) 2060 1993 goto set_hint; 2061 1994 if (status < 0 && status != -ENOSPC) { ··· 2068 1991 goto bail; 2069 1992 } 2070 1993 } 2071 - 1994 + chain_search: 2072 1995 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; 2073 1996 if (!le16_to_cpu(cl->cl_next_free_rec) || 2074 1997 le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) { ··· 2190 2113 return status; 2191 2114 } 2192 2115 2116 + /* 2117 + * after ocfs2 has the ability to release block group unused space, 2118 + * the ->ip_last_used_group may be invalid. so this function returns 2119 + * ac->ac_last_group need to verify. 2120 + * refer the 'hint' in ocfs2_claim_suballoc_bits() for more details. 2121 + */ 2193 2122 static void ocfs2_init_inode_ac_group(struct inode *dir, 2194 2123 struct buffer_head *parent_di_bh, 2195 2124 struct ocfs2_alloc_context *ac) ··· 2635 2552 } 2636 2553 2637 2554 /* 2555 + * Reclaim the suballocator managed space to main bitmap. 2556 + * This function first works on the suballocator to perform the 2557 + * cleanup rec/alloc_inode job, then switches to the main bitmap 2558 + * to reclaim released space. 2559 + * 2560 + * handle: The transaction handle 2561 + * alloc_inode: The suballoc inode 2562 + * alloc_bh: The buffer_head of suballoc inode 2563 + * group_bh: The group descriptor buffer_head of suballocator managed. 2564 + * Caller should release the input group_bh. 
2565 + */ 2566 + static int _ocfs2_reclaim_suballoc_to_main(handle_t *handle, 2567 + struct inode *alloc_inode, 2568 + struct buffer_head *alloc_bh, 2569 + struct buffer_head *group_bh) 2570 + { 2571 + int idx, status = 0; 2572 + int i, next_free_rec, len = 0; 2573 + __le16 old_bg_contig_free_bits = 0; 2574 + u16 start_bit; 2575 + u32 tmp_used; 2576 + u64 bg_blkno, start_blk; 2577 + unsigned int count; 2578 + struct ocfs2_chain_rec *rec; 2579 + struct buffer_head *main_bm_bh = NULL; 2580 + struct inode *main_bm_inode = NULL; 2581 + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); 2582 + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2583 + struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2584 + struct ocfs2_group_desc *group = (struct ocfs2_group_desc *) group_bh->b_data; 2585 + 2586 + idx = le16_to_cpu(group->bg_chain); 2587 + rec = &(cl->cl_recs[idx]); 2588 + 2589 + status = ocfs2_extend_trans(handle, 2590 + ocfs2_calc_group_alloc_credits(osb->sb, 2591 + le16_to_cpu(cl->cl_cpg))); 2592 + if (status) { 2593 + mlog_errno(status); 2594 + goto bail; 2595 + } 2596 + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 2597 + alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2598 + if (status < 0) { 2599 + mlog_errno(status); 2600 + goto bail; 2601 + } 2602 + 2603 + /* 2604 + * Only clear the suballocator rec item in-place. 2605 + * 2606 + * If idx is not the last, we don't compress (remove the empty item) 2607 + * the cl_recs[]. If not, we need to do lots jobs. 2608 + * 2609 + * Compress cl_recs[] code example: 2610 + * if (idx != cl->cl_next_free_rec - 1) 2611 + * memmove(&cl->cl_recs[idx], &cl->cl_recs[idx + 1], 2612 + * sizeof(struct ocfs2_chain_rec) * 2613 + * (cl->cl_next_free_rec - idx - 1)); 2614 + * for(i = idx; i < cl->cl_next_free_rec-1; i++) { 2615 + * group->bg_chain = "later group->bg_chain"; 2616 + * group->bg_blkno = xxx; 2617 + * ... ... 
2618 + * } 2619 + */ 2620 + 2621 + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_total); 2622 + fe->id1.bitmap1.i_total = cpu_to_le32(tmp_used - le32_to_cpu(rec->c_total)); 2623 + 2624 + /* Substraction 1 for the block group itself */ 2625 + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2626 + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - 1); 2627 + 2628 + tmp_used = le32_to_cpu(fe->i_clusters); 2629 + fe->i_clusters = cpu_to_le32(tmp_used - le16_to_cpu(cl->cl_cpg)); 2630 + 2631 + spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 2632 + OCFS2_I(alloc_inode)->ip_clusters -= le32_to_cpu(fe->i_clusters); 2633 + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, 2634 + le32_to_cpu(fe->i_clusters))); 2635 + spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 2636 + i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 2637 + alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 2638 + 2639 + ocfs2_journal_dirty(handle, alloc_bh); 2640 + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); 2641 + 2642 + start_blk = le64_to_cpu(rec->c_blkno); 2643 + count = le32_to_cpu(rec->c_total) / le16_to_cpu(cl->cl_bpc); 2644 + 2645 + /* 2646 + * If the rec is the last one, let's compress the chain list by 2647 + * removing the empty cl_recs[] at the end. 
2648 + */ 2649 + next_free_rec = le16_to_cpu(cl->cl_next_free_rec); 2650 + if (idx == (next_free_rec - 1)) { 2651 + len++; /* the last item should be counted first */ 2652 + for (i = (next_free_rec - 2); i > 0; i--) { 2653 + if (cl->cl_recs[i].c_free == cl->cl_recs[i].c_total) 2654 + len++; 2655 + else 2656 + break; 2657 + } 2658 + } 2659 + le16_add_cpu(&cl->cl_next_free_rec, -len); 2660 + 2661 + rec->c_free = 0; 2662 + rec->c_total = 0; 2663 + rec->c_blkno = 0; 2664 + ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), group_bh); 2665 + memset(group, 0, sizeof(struct ocfs2_group_desc)); 2666 + 2667 + /* prepare job for reclaim clusters */ 2668 + main_bm_inode = ocfs2_get_system_file_inode(osb, 2669 + GLOBAL_BITMAP_SYSTEM_INODE, 2670 + OCFS2_INVALID_SLOT); 2671 + if (!main_bm_inode) 2672 + goto bail; /* ignore the error in reclaim path */ 2673 + 2674 + inode_lock(main_bm_inode); 2675 + 2676 + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); 2677 + if (status < 0) 2678 + goto free_bm_inode; /* ignore the error in reclaim path */ 2679 + 2680 + ocfs2_block_to_cluster_group(main_bm_inode, start_blk, &bg_blkno, 2681 + &start_bit); 2682 + fe = (struct ocfs2_dinode *) main_bm_bh->b_data; 2683 + cl = &fe->id2.i_chain; 2684 + /* reuse group_bh, caller will release the input group_bh */ 2685 + group_bh = NULL; 2686 + 2687 + /* reclaim clusters to global_bitmap */ 2688 + status = ocfs2_read_group_descriptor(main_bm_inode, fe, bg_blkno, 2689 + &group_bh); 2690 + if (status < 0) { 2691 + mlog_errno(status); 2692 + goto free_bm_bh; 2693 + } 2694 + group = (struct ocfs2_group_desc *) group_bh->b_data; 2695 + 2696 + if ((count + start_bit) > le16_to_cpu(group->bg_bits)) { 2697 + ocfs2_error(alloc_inode->i_sb, 2698 + "reclaim length (%d) beyands block group length (%d)", 2699 + count + start_bit, le16_to_cpu(group->bg_bits)); 2700 + goto free_group_bh; 2701 + } 2702 + 2703 + old_bg_contig_free_bits = group->bg_contig_free_bits; 2704 + status = 
ocfs2_block_group_clear_bits(handle, main_bm_inode, 2705 + group, group_bh, 2706 + start_bit, count, 0, 2707 + _ocfs2_clear_bit); 2708 + if (status < 0) { 2709 + mlog_errno(status); 2710 + goto free_group_bh; 2711 + } 2712 + 2713 + status = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 2714 + main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 2715 + if (status < 0) { 2716 + mlog_errno(status); 2717 + ocfs2_block_group_set_bits(handle, main_bm_inode, group, group_bh, 2718 + start_bit, count, 2719 + le16_to_cpu(old_bg_contig_free_bits), 1); 2720 + goto free_group_bh; 2721 + } 2722 + 2723 + idx = le16_to_cpu(group->bg_chain); 2724 + rec = &(cl->cl_recs[idx]); 2725 + 2726 + le32_add_cpu(&rec->c_free, count); 2727 + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2728 + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2729 + ocfs2_journal_dirty(handle, main_bm_bh); 2730 + 2731 + free_group_bh: 2732 + brelse(group_bh); 2733 + 2734 + free_bm_bh: 2735 + ocfs2_inode_unlock(main_bm_inode, 1); 2736 + brelse(main_bm_bh); 2737 + 2738 + free_bm_inode: 2739 + inode_unlock(main_bm_inode); 2740 + iput(main_bm_inode); 2741 + 2742 + bail: 2743 + return status; 2744 + } 2745 + 2746 + /* 2638 2747 * expects the suballoc inode to already be locked. 
2639 2748 */ 2640 2749 static int _ocfs2_free_suballoc_bits(handle_t *handle, ··· 2838 2563 void (*undo_fn)(unsigned int bit, 2839 2564 unsigned long *bitmap)) 2840 2565 { 2841 - int status = 0; 2566 + int idx, status = 0; 2842 2567 u32 tmp_used; 2843 2568 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 2844 2569 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 2845 2570 struct buffer_head *group_bh = NULL; 2846 2571 struct ocfs2_group_desc *group; 2572 + struct ocfs2_chain_rec *rec; 2847 2573 __le16 old_bg_contig_free_bits = 0; 2848 2574 2849 2575 /* The alloc_bh comes from ocfs2_free_dinode() or ··· 2890 2614 goto bail; 2891 2615 } 2892 2616 2893 - le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, 2894 - count); 2617 + idx = le16_to_cpu(group->bg_chain); 2618 + rec = &(cl->cl_recs[idx]); 2619 + 2620 + le32_add_cpu(&rec->c_free, count); 2895 2621 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2896 2622 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2897 2623 ocfs2_journal_dirty(handle, alloc_bh); 2624 + 2625 + /* 2626 + * Reclaim suballocator free space. 2627 + * Bypass: global_bitmap, non empty rec, first rec in cl_recs[] 2628 + */ 2629 + if (ocfs2_is_cluster_bitmap(alloc_inode) || 2630 + (le32_to_cpu(rec->c_free) != (le32_to_cpu(rec->c_total) - 1)) || 2631 + (le16_to_cpu(cl->cl_next_free_rec) == 1)) { 2632 + goto bail; 2633 + } 2634 + 2635 + _ocfs2_reclaim_suballoc_to_main(handle, alloc_inode, alloc_bh, group_bh); 2898 2636 2899 2637 bail: 2900 2638 brelse(group_bh);