Merge tag 'erofs-for-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs

+11

Documentation/ABI/testing/sysfs-fs-erofs

··· 16 16 readahead on atomic contexts only. 17 17 - 1 (force on): enable for readpage and readahead. 18 18 - 2 (force off): disable for all situations. 19 + 20 + What: /sys/fs/erofs/<disk>/drop_caches 21 + Date: November 2024 22 + Contact: "Guo Chunhai" <guochunhai@vivo.com> 23 + Description: Writing to this will drop compression-related caches, 24 + currently used to drop in-memory pclusters and cached 25 + compressed folios: 26 + 27 + - 1 : invalidate cached compressed folios 28 + - 2 : drop in-memory pclusters 29 + - 3 : drop in-memory pclusters and cached compressed folios

+37 -32

fs/erofs/data.c

··· 10 10 11 11 void erofs_unmap_metabuf(struct erofs_buf *buf) 12 12 { 13 - if (buf->kmap_type == EROFS_KMAP) 14 - kunmap_local(buf->base); 13 + if (!buf->base) 14 + return; 15 + kunmap_local(buf->base); 15 16 buf->base = NULL; 16 - buf->kmap_type = EROFS_NO_KMAP; 17 17 } 18 18 19 19 void erofs_put_metabuf(struct erofs_buf *buf) ··· 38 38 } 39 39 if (!folio || !folio_contains(folio, index)) { 40 40 erofs_put_metabuf(buf); 41 - folio = read_mapping_folio(buf->mapping, index, NULL); 41 + folio = read_mapping_folio(buf->mapping, index, buf->file); 42 42 if (IS_ERR(folio)) 43 43 return folio; 44 44 } 45 45 buf->page = folio_file_page(folio, index); 46 - 47 - if (buf->kmap_type == EROFS_NO_KMAP) { 48 - if (type == EROFS_KMAP) 49 - buf->base = kmap_local_page(buf->page); 50 - buf->kmap_type = type; 51 - } else if (buf->kmap_type != type) { 52 - DBG_BUGON(1); 53 - return ERR_PTR(-EFAULT); 54 - } 46 + if (!buf->base && type == EROFS_KMAP) 47 + buf->base = kmap_local_page(buf->page); 55 48 if (type == EROFS_NO_KMAP) 56 49 return NULL; 57 50 return buf->base + (offset & ~PAGE_MASK); ··· 54 61 { 55 62 struct erofs_sb_info *sbi = EROFS_SB(sb); 56 63 57 - if (erofs_is_fileio_mode(sbi)) 58 - buf->mapping = file_inode(sbi->fdev)->i_mapping; 59 - else if (erofs_is_fscache_mode(sb)) 64 + buf->file = NULL; 65 + if (erofs_is_fileio_mode(sbi)) { 66 + buf->file = sbi->fdev; /* some fs like FUSE needs it */ 67 + buf->mapping = buf->file->f_mapping; 68 + } else if (erofs_is_fscache_mode(sb)) 60 69 buf->mapping = sbi->s_fscache->inode->i_mapping; 61 70 else 62 71 buf->mapping = sb->s_bdev->bd_mapping; ··· 345 350 struct erofs_buf buf = { 346 351 .page = kmap_to_page(ptr), 347 352 .base = ptr, 348 - .kmap_type = EROFS_KMAP, 349 353 }; 350 354 351 355 DBG_BUGON(iomap->type != IOMAP_INLINE); ··· 405 411 if (IS_DAX(inode)) 406 412 return dax_iomap_rw(iocb, to, &erofs_iomap_ops); 407 413 #endif 408 - if (iocb->ki_flags & IOCB_DIRECT) { 409 - struct block_device *bdev = inode->i_sb->s_bdev; 410 - unsigned int blksize_mask; 411 - 412 - if (bdev) 413 - blksize_mask = bdev_logical_block_size(bdev) - 1; 414 - else 415 - blksize_mask = i_blocksize(inode) - 1; 416 - 417 - if ((iocb->ki_pos | iov_iter_count(to) | 418 - iov_iter_alignment(to)) & blksize_mask) 419 - return -EINVAL; 420 - 414 + if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) 421 415 return iomap_dio_rw(iocb, to, &erofs_iomap_ops, 422 416 NULL, 0, NULL, 0); 423 - } 424 417 return filemap_read(iocb, to, 0); 425 418 } 426 419 ··· 454 473 #define erofs_file_mmap generic_file_readonly_mmap 455 474 #endif 456 475 476 + static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) 477 + { 478 + struct inode *inode = file->f_mapping->host; 479 + const struct iomap_ops *ops = &erofs_iomap_ops; 480 + 481 + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) 482 + #ifdef CONFIG_EROFS_FS_ZIP 483 + ops = &z_erofs_iomap_report_ops; 484 + #else 485 + return generic_file_llseek(file, offset, whence); 486 + #endif 487 + 488 + if (whence == SEEK_HOLE) 489 + offset = iomap_seek_hole(inode, offset, ops); 490 + else if (whence == SEEK_DATA) 491 + offset = iomap_seek_data(inode, offset, ops); 492 + else 493 + return generic_file_llseek(file, offset, whence); 494 + 495 + if (offset < 0) 496 + return offset; 497 + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 498 + } 499 + 457 500 const struct file_operations erofs_file_fops = { 458 - .llseek = generic_file_llseek, 501 + .llseek = erofs_file_llseek, 459 502 .read_iter = erofs_file_read_iter, 460 503 .mmap = erofs_file_mmap, 461 504 .get_unmapped_area = thp_get_unmapped_area,

+6 -6

fs/erofs/inode.c

··· 318 318 unsigned int query_flags) 319 319 { 320 320 struct inode *const inode = d_inode(path->dentry); 321 + struct block_device *bdev = inode->i_sb->s_bdev; 321 322 bool compressed = 322 323 erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout); 323 324 ··· 331 330 /* 332 331 * Return the DIO alignment restrictions if requested. 333 332 * 334 - * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and 335 - * compressed files, so in these cases we report no DIO support. 333 + * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode 334 + * and uncompressed inodes, otherwise we report no DIO support. 336 335 */ 337 336 if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { 338 337 stat->result_mask |= STATX_DIOALIGN; 339 - if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) { 340 - stat->dio_mem_align = 341 - bdev_logical_block_size(inode->i_sb->s_bdev); 342 - stat->dio_offset_align = stat->dio_mem_align; 338 + if (bdev && !compressed) { 339 + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; 340 + stat->dio_offset_align = bdev_logical_block_size(bdev); 343 341 } 344 342 } 345 343 generic_fillattr(idmap, request_mask, inode, stat);

+10 -25

fs/erofs/internal.h

··· 20 20 #include <linux/iomap.h> 21 21 #include "erofs_fs.h" 22 22 23 - /* redefine pr_fmt "erofs: " */ 24 - #undef pr_fmt 25 - #define pr_fmt(fmt) "erofs: " fmt 26 - 27 - __printf(3, 4) void _erofs_err(struct super_block *sb, 28 - const char *function, const char *fmt, ...); 23 + __printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...); 29 24 #define erofs_err(sb, fmt, ...) \ 30 - _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__) 31 - __printf(3, 4) void _erofs_info(struct super_block *sb, 32 - const char *function, const char *fmt, ...); 25 + _erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__) 33 26 #define erofs_info(sb, fmt, ...) \ 34 - _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) 27 + _erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__) 28 + 35 29 #ifdef CONFIG_EROFS_FS_DEBUG 36 30 #define DBG_BUGON BUG_ON 37 31 #else ··· 202 208 EROFS_ZIP_CACHE_READAROUND 203 209 }; 204 210 205 - /* basic unit of the workstation of a super_block */ 206 - struct erofs_workgroup { 207 - pgoff_t index; 208 - struct lockref lockref; 209 - }; 210 - 211 211 enum erofs_kmap_type { 212 212 EROFS_NO_KMAP, /* don't map the buffer */ 213 213 EROFS_KMAP, /* use kmap_local_page() to map the buffer */ ··· 209 221 210 222 struct erofs_buf { 211 223 struct address_space *mapping; 224 + struct file *file; 212 225 struct page *page; 213 226 void *base; 214 - enum erofs_kmap_type kmap_type; 215 227 }; 216 228 #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) 217 229 ··· 444 456 void erofs_release_pages(struct page **pagepool); 445 457 446 458 #ifdef CONFIG_EROFS_FS_ZIP 447 - void erofs_workgroup_put(struct erofs_workgroup *grp); 448 - struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, 449 - pgoff_t index); 450 - struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, 451 - struct erofs_workgroup *grp); 452 - void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); 459 + #define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) 460 + 461 + extern atomic_long_t erofs_global_shrink_cnt; 453 462 void erofs_shrinker_register(struct super_block *sb); 454 463 void erofs_shrinker_unregister(struct super_block *sb); 455 464 int __init erofs_init_shrinker(void); 456 465 void erofs_exit_shrinker(void); 457 466 int __init z_erofs_init_subsystem(void); 458 467 void z_erofs_exit_subsystem(void); 459 - int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, 460 - struct erofs_workgroup *egrp); 468 + unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, 469 + unsigned long nr_shrink); 461 470 int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, 462 471 int flags); 463 472 void *z_erofs_get_gbuf(unsigned int requiredpages);

+12 -23

fs/erofs/super.c

··· 18 18 19 19 static struct kmem_cache *erofs_inode_cachep __read_mostly; 20 20 21 - void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) 21 + void _erofs_printk(struct super_block *sb, const char *fmt, ...) 22 22 { 23 23 struct va_format vaf; 24 24 va_list args; 25 + int level; 25 26 26 27 va_start(args, fmt); 27 28 28 - vaf.fmt = fmt; 29 + level = printk_get_level(fmt); 30 + vaf.fmt = printk_skip_level(fmt); 29 31 vaf.va = &args; 30 - 31 32 if (sb) 32 - pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); 33 + printk("%c%cerofs (device %s): %pV", 34 + KERN_SOH_ASCII, level, sb->s_id, &vaf); 33 35 else 34 - pr_err("%s: %pV", func, &vaf); 35 - va_end(args); 36 - } 37 - 38 - void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) 39 - { 40 - struct va_format vaf; 41 - va_list args; 42 - 43 - va_start(args, fmt); 44 - 45 - vaf.fmt = fmt; 46 - vaf.va = &args; 47 - 48 - if (sb) 49 - pr_info("(device %s): %pV", sb->s_id, &vaf); 50 - else 51 - pr_info("%pV", &vaf); 36 + printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf); 52 37 va_end(args); 53 38 } 54 39 ··· 616 631 errorfc(fc, "unsupported blksize for fscache mode"); 617 632 return -EINVAL; 618 633 } 619 - if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { 634 + 635 + if (erofs_is_fileio_mode(sbi)) { 636 + sb->s_blocksize = 1 << sbi->blkszbits; 637 + sb->s_blocksize_bits = sbi->blkszbits; 638 + } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { 620 639 errorfc(fc, "failed to set erofs blksize"); 621 640 return -EINVAL; 622 641 }

+17

fs/erofs/sysfs.c

··· 10 10 11 11 enum { 12 12 attr_feature, 13 + attr_drop_caches, 13 14 attr_pointer_ui, 14 15 attr_pointer_bool, 15 16 }; ··· 58 57 59 58 #ifdef CONFIG_EROFS_FS_ZIP 60 59 EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); 60 + EROFS_ATTR_FUNC(drop_caches, 0200); 61 61 #endif 62 62 63 63 static struct attribute *erofs_attrs[] = { 64 64 #ifdef CONFIG_EROFS_FS_ZIP 65 65 ATTR_LIST(sync_decompress), 66 + ATTR_LIST(drop_caches), 66 67 #endif 67 68 NULL, 68 69 }; ··· 166 163 return -EINVAL; 167 164 *(bool *)ptr = !!t; 168 165 return len; 166 + #ifdef CONFIG_EROFS_FS_ZIP 167 + case attr_drop_caches: 168 + ret = kstrtoul(skip_spaces(buf), 0, &t); 169 + if (ret) 170 + return ret; 171 + if (t < 1 || t > 3) 172 + return -EINVAL; 173 + 174 + if (t & 2) 175 + z_erofs_shrink_scan(sbi, ~0UL); 176 + if (t & 1) 177 + invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1); 178 + return len; 179 + #endif 169 180 } 170 181 return 0; 171 182 }

+167 -54

fs/erofs/zdata.c

··· 44 44 * A: Field should be accessed / updated in atomic for parallelized code. 45 45 */ 46 46 struct z_erofs_pcluster { 47 - struct erofs_workgroup obj; 48 47 struct mutex lock; 48 + struct lockref lockref; 49 49 50 50 /* A: point to next chained pcluster or TAILs */ 51 51 z_erofs_next_pcluster_t next; 52 + 53 + /* I: start block address of this pcluster */ 54 + erofs_off_t index; 52 55 53 56 /* L: the maximum decompression size of this round */ 54 57 unsigned int length; ··· 111 108 112 109 static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) 113 110 { 114 - return !pcl->obj.index; 111 + return !pcl->index; 115 112 } 116 113 117 114 static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) ··· 119 116 return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; 120 117 } 121 118 122 - #define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) 123 119 static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) 124 120 { 125 121 return fo->mapping == MNGD_MAPPING(sbi); ··· 550 548 if (READ_ONCE(pcl->compressed_bvecs[i].page)) 551 549 continue; 552 550 553 - page = find_get_page(mc, pcl->obj.index + i); 551 + page = find_get_page(mc, pcl->index + i); 554 552 if (!page) { 555 553 /* I/O is needed, no possible to decompress directly */ 556 554 standalone = false; ··· 566 564 continue; 567 565 set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); 568 566 } 569 - spin_lock(&pcl->obj.lockref.lock); 567 + spin_lock(&pcl->lockref.lock); 570 568 if (!pcl->compressed_bvecs[i].page) { 571 569 pcl->compressed_bvecs[i].page = page ? page : newpage; 572 - spin_unlock(&pcl->obj.lockref.lock); 570 + spin_unlock(&pcl->lockref.lock); 573 571 continue; 574 572 } 575 - spin_unlock(&pcl->obj.lockref.lock); 573 + spin_unlock(&pcl->lockref.lock); 576 574 577 575 if (page) 578 576 put_page(page); ··· 589 587 } 590 588 591 589 /* (erofs_shrinker) disconnect cached encoded data with pclusters */ 592 - int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, 593 - struct erofs_workgroup *grp) 590 + static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, 591 + struct z_erofs_pcluster *pcl) 594 592 { 595 - struct z_erofs_pcluster *const pcl = 596 - container_of(grp, struct z_erofs_pcluster, obj); 597 593 unsigned int pclusterpages = z_erofs_pclusterpages(pcl); 598 594 struct folio *folio; 599 595 int i; ··· 626 626 return true; 627 627 628 628 ret = false; 629 - spin_lock(&pcl->obj.lockref.lock); 630 - if (pcl->obj.lockref.count <= 0) { 629 + spin_lock(&pcl->lockref.lock); 630 + if (pcl->lockref.count <= 0) { 631 631 DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); 632 632 for (; bvec < end; ++bvec) { 633 633 if (bvec->page && page_folio(bvec->page) == folio) { ··· 638 638 } 639 639 } 640 640 } 641 - spin_unlock(&pcl->obj.lockref.lock); 641 + spin_unlock(&pcl->lockref.lock); 642 642 return ret; 643 643 } 644 644 ··· 689 689 690 690 if (exclusive) { 691 691 /* give priority for inplaceio to use file pages first */ 692 - spin_lock(&pcl->obj.lockref.lock); 692 + spin_lock(&pcl->lockref.lock); 693 693 while (fe->icur > 0) { 694 694 if (pcl->compressed_bvecs[--fe->icur].page) 695 695 continue; 696 696 pcl->compressed_bvecs[fe->icur] = *bvec; 697 - spin_unlock(&pcl->obj.lockref.lock); 697 + spin_unlock(&pcl->lockref.lock); 698 698 return 0; 699 699 } 700 - spin_unlock(&pcl->obj.lockref.lock); 700 + spin_unlock(&pcl->lockref.lock); 701 701 702 702 /* otherwise, check if it can be used as a bvpage */ 703 703 if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && ··· 710 710 return ret; 711 711 } 712 712 713 + static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl) 714 + { 715 + if (lockref_get_not_zero(&pcl->lockref)) 716 + return true; 717 + 718 + spin_lock(&pcl->lockref.lock); 719 + if (__lockref_is_dead(&pcl->lockref)) { 720 + spin_unlock(&pcl->lockref.lock); 721 + return false; 722 + } 723 + 724 + if (!pcl->lockref.count++) 725 + atomic_long_dec(&erofs_global_shrink_cnt); 726 + spin_unlock(&pcl->lockref.lock); 727 + return true; 728 + } 729 + 713 730 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) 714 731 { 715 732 struct erofs_map_blocks *map = &fe->map; 716 733 struct super_block *sb = fe->inode->i_sb; 734 + struct erofs_sb_info *sbi = EROFS_SB(sb); 717 735 bool ztailpacking = map->m_flags & EROFS_MAP_META; 718 - struct z_erofs_pcluster *pcl; 719 - struct erofs_workgroup *grp; 736 + struct z_erofs_pcluster *pcl, *pre; 720 737 int err; 721 738 722 739 if (!(map->m_flags & EROFS_MAP_ENCODED) || ··· 747 730 if (IS_ERR(pcl)) 748 731 return PTR_ERR(pcl); 749 732 750 - spin_lock_init(&pcl->obj.lockref.lock); 751 - pcl->obj.lockref.count = 1; /* one ref for this request */ 733 + spin_lock_init(&pcl->lockref.lock); 734 + pcl->lockref.count = 1; /* one ref for this request */ 752 735 pcl->algorithmformat = map->m_algorithmformat; 753 736 pcl->length = 0; 754 737 pcl->partial = true; ··· 766 749 DBG_BUGON(!mutex_trylock(&pcl->lock)); 767 750 768 751 if (ztailpacking) { 769 - pcl->obj.index = 0; /* which indicates ztailpacking */ 752 + pcl->index = 0; /* which indicates ztailpacking */ 770 753 } else { 771 - pcl->obj.index = erofs_blknr(sb, map->m_pa); 772 - 773 - grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); 774 - if (IS_ERR(grp)) { 775 - err = PTR_ERR(grp); 776 - goto err_out; 754 + pcl->index = erofs_blknr(sb, map->m_pa); 755 + while (1) { 756 + xa_lock(&sbi->managed_pslots); 757 + pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index, 758 + NULL, pcl, GFP_KERNEL); 759 + if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { 760 + xa_unlock(&sbi->managed_pslots); 761 + break; 762 + } 763 + /* try to legitimize the current in-tree one */ 764 + xa_unlock(&sbi->managed_pslots); 765 + cond_resched(); 777 766 } 778 - 779 - if (grp != &pcl->obj) { 780 - fe->pcl = container_of(grp, 781 - struct z_erofs_pcluster, obj); 767 + if (xa_is_err(pre)) { 768 + err = xa_err(pre); 769 + goto err_out; 770 + } else if (pre) { 771 + fe->pcl = pre; 782 772 err = -EEXIST; 783 773 goto err_out; 784 774 } ··· 805 781 struct erofs_map_blocks *map = &fe->map; 806 782 struct super_block *sb = fe->inode->i_sb; 807 783 erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); 808 - struct erofs_workgroup *grp = NULL; 784 + struct z_erofs_pcluster *pcl = NULL; 809 785 int ret; 810 786 811 787 DBG_BUGON(fe->pcl); ··· 813 789 DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); 814 790 815 791 if (!(map->m_flags & EROFS_MAP_META)) { 816 - grp = erofs_find_workgroup(sb, blknr); 792 + while (1) { 793 + rcu_read_lock(); 794 + pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr); 795 + if (!pcl || z_erofs_get_pcluster(pcl)) { 796 + DBG_BUGON(pcl && blknr != pcl->index); 797 + rcu_read_unlock(); 798 + break; 799 + } 800 + rcu_read_unlock(); 801 + } 817 802 } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { 818 803 DBG_BUGON(1); 819 804 return -EFSCORRUPTED; 820 805 } 821 806 822 - if (grp) { 823 - fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); 807 + if (pcl) { 808 + fe->pcl = pcl; 824 809 ret = -EEXIST; 825 810 } else { 826 811 ret = z_erofs_register_pcluster(fe); ··· 884 851 struct z_erofs_pcluster, rcu)); 885 852 } 886 853 887 - void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) 854 + static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, 855 + struct z_erofs_pcluster *pcl) 888 856 { 889 - struct z_erofs_pcluster *const pcl = 890 - container_of(grp, struct z_erofs_pcluster, obj); 857 + if (pcl->lockref.count) 858 + return false; 891 859 892 - call_rcu(&pcl->rcu, z_erofs_rcu_callback); 860 + /* 861 + * Note that all cached folios should be detached before deleted from 862 + * the XArray. Otherwise some folios could be still attached to the 863 + * orphan old pcluster when the new one is available in the tree. 864 + */ 865 + if (erofs_try_to_free_all_cached_folios(sbi, pcl)) 866 + return false; 867 + 868 + /* 869 + * It's impossible to fail after the pcluster is freezed, but in order 870 + * to avoid some race conditions, add a DBG_BUGON to observe this. 871 + */ 872 + DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl); 873 + 874 + lockref_mark_dead(&pcl->lockref); 875 + return true; 876 + } 877 + 878 + static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, 879 + struct z_erofs_pcluster *pcl) 880 + { 881 + bool free; 882 + 883 + spin_lock(&pcl->lockref.lock); 884 + free = __erofs_try_to_release_pcluster(sbi, pcl); 885 + spin_unlock(&pcl->lockref.lock); 886 + if (free) { 887 + atomic_long_dec(&erofs_global_shrink_cnt); 888 + call_rcu(&pcl->rcu, z_erofs_rcu_callback); 889 + } 890 + return free; 891 + } 892 + 893 + unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, 894 + unsigned long nr_shrink) 895 + { 896 + struct z_erofs_pcluster *pcl; 897 + unsigned int freed = 0; 898 + unsigned long index; 899 + 900 + xa_lock(&sbi->managed_pslots); 901 + xa_for_each(&sbi->managed_pslots, index, pcl) { 902 + /* try to shrink each valid pcluster */ 903 + if (!erofs_try_to_release_pcluster(sbi, pcl)) 904 + continue; 905 + xa_unlock(&sbi->managed_pslots); 906 + 907 + ++freed; 908 + if (!--nr_shrink) 909 + return freed; 910 + xa_lock(&sbi->managed_pslots); 911 + } 912 + xa_unlock(&sbi->managed_pslots); 913 + return freed; 914 + } 915 + 916 + static void z_erofs_put_pcluster(struct erofs_sb_info *sbi, 917 + struct z_erofs_pcluster *pcl, bool try_free) 918 + { 919 + bool free = false; 920 + 921 + if (lockref_put_or_lock(&pcl->lockref)) 922 + return; 923 + 924 + DBG_BUGON(__lockref_is_dead(&pcl->lockref)); 925 + if (!--pcl->lockref.count) { 926 + if (try_free && xa_trylock(&sbi->managed_pslots)) { 927 + free = __erofs_try_to_release_pcluster(sbi, pcl); 928 + xa_unlock(&sbi->managed_pslots); 929 + } 930 + atomic_long_add(!free, &erofs_global_shrink_cnt); 931 + } 932 + spin_unlock(&pcl->lockref.lock); 933 + if (free) 934 + call_rcu(&pcl->rcu, z_erofs_rcu_callback); 893 935 } 894 936 895 937 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) ··· 985 877 * any longer if the pcluster isn't hosted by ourselves. 986 878 */ 987 879 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) 988 - erofs_workgroup_put(&pcl->obj); 880 + z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false); 989 881 990 882 fe->pcl = NULL; 991 883 } ··· 1287 1179 int i, j, jtop, err2; 1288 1180 struct page *page; 1289 1181 bool overlapped; 1182 + bool try_free = true; 1290 1183 1291 1184 mutex_lock(&pcl->lock); 1292 1185 be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; ··· 1345 1236 /* managed folios are still left in compressed_bvecs[] */ 1346 1237 for (i = 0; i < pclusterpages; ++i) { 1347 1238 page = be->compressed_pages[i]; 1348 - if (!page || 1349 - erofs_folio_is_managed(sbi, page_folio(page))) 1239 + if (!page) 1350 1240 continue; 1241 + if (erofs_folio_is_managed(sbi, page_folio(page))) { 1242 + try_free = false; 1243 + continue; 1244 + } 1351 1245 (void)z_erofs_put_shortlivedpage(be->pagepool, page); 1352 1246 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); 1353 1247 } ··· 1396 1284 /* pcluster lock MUST be taken before the following line */ 1397 1285 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); 1398 1286 mutex_unlock(&pcl->lock); 1287 + 1288 + if (z_erofs_is_inline_pcluster(pcl)) 1289 + z_erofs_free_pcluster(pcl); 1290 + else 1291 + z_erofs_put_pcluster(sbi, pcl, try_free); 1399 1292 return err; 1400 1293 } 1401 1294 ··· 1423 1306 owned = READ_ONCE(be.pcl->next); 1424 1307 1425 1308 err = z_erofs_decompress_pcluster(&be, err) ?: err; 1426 - if (z_erofs_is_inline_pcluster(be.pcl)) 1427 - z_erofs_free_pcluster(be.pcl); 1428 - else 1429 - erofs_workgroup_put(&be.pcl->obj); 1430 1309 } 1431 1310 return err; 1432 1311 } ··· 1504 1391 bvec->bv_offset = 0; 1505 1392 bvec->bv_len = PAGE_SIZE; 1506 1393 repeat: 1507 - spin_lock(&pcl->obj.lockref.lock); 1394 + spin_lock(&pcl->lockref.lock); 1508 1395 zbv = pcl->compressed_bvecs[nr]; 1509 - spin_unlock(&pcl->obj.lockref.lock); 1396 + spin_unlock(&pcl->lockref.lock); 1510 1397 if (!zbv.page) 1511 1398 goto out_allocfolio; 1512 1399 ··· 1568 1455 folio_put(folio); 1569 1456 out_allocfolio: 1570 1457 page = __erofs_allocpage(&f->pagepool, gfp, true); 1571 - spin_lock(&pcl->obj.lockref.lock); 1458 + spin_lock(&pcl->lockref.lock); 1572 1459 if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) { 1573 1460 if (page) 1574 1461 erofs_pagepool_add(&f->pagepool, page); 1575 - spin_unlock(&pcl->obj.lockref.lock); 1462 + spin_unlock(&pcl->lockref.lock); 1576 1463 cond_resched(); 1577 1464 goto repeat; 1578 1465 } 1579 1466 pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM); 1580 - spin_unlock(&pcl->obj.lockref.lock); 1467 + spin_unlock(&pcl->lockref.lock); 1581 1468 bvec->bv_page = page; 1582 1469 if (!page) 1583 1470 return; 1584 1471 folio = page_folio(page); 1585 1472 out_tocache: 1586 1473 if (!tocache || bs != PAGE_SIZE || 1587 - filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) { 1474 + filemap_add_folio(mc, folio, pcl->index + nr, gfp)) { 1588 1475 /* turn into a temporary shortlived folio (1 ref) */ 1589 1476 folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; 1590 1477 return; ··· 1716 1603 1717 1604 /* no device id here, thus it will always succeed */ 1718 1605 mdev = (struct erofs_map_dev) { 1719 - .m_pa = erofs_pos(sb, pcl->obj.index), 1606 + .m_pa = erofs_pos(sb, pcl->index), 1720 1607 }; 1721 1608 (void)erofs_map_dev(sb, &mdev); 1722 1609

+9 -8

fs/erofs/zmap.c

··· 219 219 unsigned int amortizedshift; 220 220 erofs_off_t pos; 221 221 222 - if (lcn >= totalidx) 222 + if (lcn >= totalidx || vi->z_logical_clusterbits > 14) 223 223 return -EINVAL; 224 224 225 225 m->lcn = lcn; ··· 390 390 u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; 391 391 int err; 392 392 393 - do { 393 + while (1) { 394 394 /* handle the last EOF pcluster (no next HEAD lcluster) */ 395 395 if ((lcn << lclusterbits) >= inode->i_size) { 396 396 map->m_llen = inode->i_size - map->m_la; ··· 402 402 return err; 403 403 404 404 if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { 405 - DBG_BUGON(!m->delta[1] && 406 - m->clusterofs != 1 << lclusterbits); 405 + /* work around invalid d1 generated by pre-1.0 mkfs */ 406 + if (unlikely(!m->delta[1])) { 407 + m->delta[1] = 1; 408 + DBG_BUGON(1); 409 + } 407 410 } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || 408 411 m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || 409 412 m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { 410 - /* go on until the next HEAD lcluster */ 411 413 if (lcn != headlcn) 412 - break; 414 + break; /* ends at the next HEAD lcluster */ 413 415 m->delta[1] = 1; 414 416 } else { 415 417 erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", ··· 420 418 return -EOPNOTSUPP; 421 419 } 422 420 lcn += m->delta[1]; 423 - } while (m->delta[1]); 424 - 421 + } 425 422 map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la; 426 423 return 0; 427 424 }

+7 -148

fs/erofs/zutil.c

··· 2 2 /* 3 3 * Copyright (C) 2018 HUAWEI, Inc. 4 4 * https://www.huawei.com/ 5 + * Copyright (C) 2024 Alibaba Cloud 5 6 */ 6 7 #include "internal.h" 7 8 ··· 20 19 module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444); 21 20 module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444); 22 21 23 - static atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */ 24 - /* protected by 'erofs_sb_list_lock' */ 25 - static unsigned int shrinker_run_no; 22 + atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */ 26 23 27 - /* protects the mounted 'erofs_sb_list' */ 24 + /* protects `erofs_sb_list_lock` and the mounted `erofs_sb_list` */ 28 25 static DEFINE_SPINLOCK(erofs_sb_list_lock); 29 26 static LIST_HEAD(erofs_sb_list); 27 + static unsigned int shrinker_run_no; 30 28 static struct shrinker *erofs_shrinker_info; 31 29 32 30 static unsigned int z_erofs_gbuf_id(void) ··· 214 214 } 215 215 } 216 216 217 - static bool erofs_workgroup_get(struct erofs_workgroup *grp) 218 - { 219 - if (lockref_get_not_zero(&grp->lockref)) 220 - return true; 221 - 222 - spin_lock(&grp->lockref.lock); 223 - if (__lockref_is_dead(&grp->lockref)) { 224 - spin_unlock(&grp->lockref.lock); 225 - return false; 226 - } 227 - 228 - if (!grp->lockref.count++) 229 - atomic_long_dec(&erofs_global_shrink_cnt); 230 - spin_unlock(&grp->lockref.lock); 231 - return true; 232 - } 233 - 234 - struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, 235 - pgoff_t index) 236 - { 237 - struct erofs_sb_info *sbi = EROFS_SB(sb); 238 - struct erofs_workgroup *grp; 239 - 240 - repeat: 241 - rcu_read_lock(); 242 - grp = xa_load(&sbi->managed_pslots, index); 243 - if (grp) { 244 - if (!erofs_workgroup_get(grp)) { 245 - /* prefer to relax rcu read side */ 246 - rcu_read_unlock(); 247 - goto repeat; 248 - } 249 - 250 - DBG_BUGON(index != grp->index); 251 - } 252 - rcu_read_unlock(); 253 - return grp; 254 - } 255 - 256 - struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, 257 - struct erofs_workgroup *grp) 258 - { 259 - struct erofs_sb_info *const sbi = EROFS_SB(sb); 260 - struct erofs_workgroup *pre; 261 - 262 - DBG_BUGON(grp->lockref.count < 1); 263 - repeat: 264 - xa_lock(&sbi->managed_pslots); 265 - pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, 266 - NULL, grp, GFP_KERNEL); 267 - if (pre) { 268 - if (xa_is_err(pre)) { 269 - pre = ERR_PTR(xa_err(pre)); 270 - } else if (!erofs_workgroup_get(pre)) { 271 - /* try to legitimize the current in-tree one */ 272 - xa_unlock(&sbi->managed_pslots); 273 - cond_resched(); 274 - goto repeat; 275 - } 276 - grp = pre; 277 - } 278 - xa_unlock(&sbi->managed_pslots); 279 - return grp; 280 - } 281 - 282 - static void __erofs_workgroup_free(struct erofs_workgroup *grp) 283 - { 284 - atomic_long_dec(&erofs_global_shrink_cnt); 285 - erofs_workgroup_free_rcu(grp); 286 - } 287 - 288 - void erofs_workgroup_put(struct erofs_workgroup *grp) 289 - { 290 - if (lockref_put_or_lock(&grp->lockref)) 291 - return; 292 - 293 - DBG_BUGON(__lockref_is_dead(&grp->lockref)); 294 - if (grp->lockref.count == 1) 295 - atomic_long_inc(&erofs_global_shrink_cnt); 296 - --grp->lockref.count; 297 - spin_unlock(&grp->lockref.lock); 298 - } 299 - 300 - static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, 301 - struct erofs_workgroup *grp) 302 - { 303 - int free = false; 304 - 305 - spin_lock(&grp->lockref.lock); 306 - if (grp->lockref.count) 307 - goto out; 308 - 309 - /* 310 - * Note that all cached pages should be detached before deleted from 311 - * the XArray. Otherwise some cached pages could be still attached to 312 - * the orphan old workgroup when the new one is available in the tree. 313 - */ 314 - if (erofs_try_to_free_all_cached_folios(sbi, grp)) 315 - goto out; 316 - 317 - /* 318 - * It's impossible to fail after the workgroup is freezed, 319 - * however in order to avoid some race conditions, add a 320 - * DBG_BUGON to observe this in advance. 321 - */ 322 - DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); 323 - 324 - lockref_mark_dead(&grp->lockref); 325 - free = true; 326 - out: 327 - spin_unlock(&grp->lockref.lock); 328 - if (free) 329 - __erofs_workgroup_free(grp); 330 - return free; 331 - } 332 - 333 - static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, 334 - unsigned long nr_shrink) 335 - { 336 - struct erofs_workgroup *grp; 337 - unsigned int freed = 0; 338 - unsigned long index; 339 - 340 - xa_lock(&sbi->managed_pslots); 341 - xa_for_each(&sbi->managed_pslots, index, grp) { 342 - /* try to shrink each valid workgroup */ 343 - if (!erofs_try_to_release_workgroup(sbi, grp)) 344 - continue; 345 - xa_unlock(&sbi->managed_pslots); 346 - 347 - ++freed; 348 - if (!--nr_shrink) 349 - return freed; 350 - xa_lock(&sbi->managed_pslots); 351 - } 352 - xa_unlock(&sbi->managed_pslots); 353 - return freed; 354 - } 355 - 356 217 void erofs_shrinker_register(struct super_block *sb) 357 218 { 358 219 struct erofs_sb_info *sbi = EROFS_SB(sb); ··· 230 369 struct erofs_sb_info *const sbi = EROFS_SB(sb); 231 370 232 371 mutex_lock(&sbi->umount_mutex); 233 - /* clean up all remaining workgroups in memory */ 234 - erofs_shrink_workstation(sbi, ~0UL); 372 + /* clean up all remaining pclusters in memory */ 373 + z_erofs_shrink_scan(sbi, ~0UL); 235 374 236 375 spin_lock(&erofs_sb_list_lock); 237 376 list_del(&sbi->list); ··· 279 418 280 419 spin_unlock(&erofs_sb_list_lock); 281 420 sbi->shrinker_run_no = run_no; 282 - 283 - freed += erofs_shrink_workstation(sbi, nr - freed); 284 - 421 + freed += z_erofs_shrink_scan(sbi, nr - freed); 285 422 spin_lock(&erofs_sb_list_lock); 286 423 /* Get the next list element before we move this one */ 287 424 p = p->next;

Configure Feed

Configure Feed