Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

xfs: cache open zone in inode->i_private

The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when doing heavy I/O to hard disks using up most
or all open zones. One option would be to just increase the timeout,
but while looking into that I realized we're just better off caching it
indefinitely as there is no real downside to that once we don't hold a
reference to the cached open zone.

So switch the open zone to RCU freeing, and then stash the last used
open zone into inode->i_private. This helps to significantly reduce
fragmentation by keeping I/O localized to zones for workloads that
write using many open files to HDD.

Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>

authored by

Christoph Hellwig and committed by
Carlos Maiolino
ca3d643a a8c861f4

+53 -85
-1
fs/xfs/xfs_mount.h
··· 236 236 bool m_update_sb; /* sb needs update in mount */ 237 237 unsigned int m_max_open_zones; 238 238 unsigned int m_zonegc_low_space; 239 - struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */ 240 239 241 240 /* max_atomic_write mount option value */ 242 241 unsigned long long m_awu_max_bytes;
+6
fs/xfs/xfs_super.c
··· 786 786 787 787 truncate_inode_pages_final(&inode->i_data); 788 788 clear_inode(inode); 789 + 790 + if (IS_ENABLED(CONFIG_XFS_RT) && 791 + S_ISREG(inode->i_mode) && inode->i_private) { 792 + xfs_open_zone_put(inode->i_private); 793 + inode->i_private = NULL; 794 + } 789 795 } 790 796 791 797 static void
+45 -84
fs/xfs/xfs_zone_alloc.c
··· 26 26 #include "xfs_trace.h" 27 27 #include "xfs_mru_cache.h" 28 28 29 + static void 30 + xfs_open_zone_free_rcu( 31 + struct callback_head *cb) 32 + { 33 + struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu); 34 + 35 + xfs_rtgroup_rele(oz->oz_rtg); 36 + kfree(oz); 37 + } 38 + 29 39 void 30 40 xfs_open_zone_put( 31 41 struct xfs_open_zone *oz) 32 42 { 33 - if (atomic_dec_and_test(&oz->oz_ref)) { 34 - xfs_rtgroup_rele(oz->oz_rtg); 35 - kfree(oz); 36 - } 43 + if (atomic_dec_and_test(&oz->oz_ref)) 44 + call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu); 37 45 } 38 46 39 47 static inline uint32_t ··· 765 757 } 766 758 767 759 /* 768 - * Cache the last zone written to for an inode so that it is considered first 769 - * for subsequent writes. 770 - */ 771 - struct xfs_zone_cache_item { 772 - struct xfs_mru_cache_elem mru; 773 - struct xfs_open_zone *oz; 774 - }; 775 - 776 - static inline struct xfs_zone_cache_item * 777 - xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) 778 - { 779 - return container_of(mru, struct xfs_zone_cache_item, mru); 780 - } 781 - 782 - static void 783 - xfs_zone_cache_free_func( 784 - void *data, 785 - struct xfs_mru_cache_elem *mru) 786 - { 787 - struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru); 788 - 789 - xfs_open_zone_put(item->oz); 790 - kfree(item); 791 - } 792 - 793 - /* 794 760 * Check if we have a cached last open zone available for the inode and 795 761 * if yes return a reference to it. 
796 762 */ 797 763 static struct xfs_open_zone * 798 - xfs_cached_zone( 799 - struct xfs_mount *mp, 800 - struct xfs_inode *ip) 764 + xfs_get_cached_zone( 765 + struct xfs_inode *ip) 801 766 { 802 - struct xfs_mru_cache_elem *mru; 803 - struct xfs_open_zone *oz; 767 + struct xfs_open_zone *oz; 804 768 805 - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); 806 - if (!mru) 807 - return NULL; 808 - oz = xfs_zone_cache_item(mru)->oz; 769 + rcu_read_lock(); 770 + oz = VFS_I(ip)->i_private; 809 771 if (oz) { 810 772 /* 811 773 * GC only steals open zones at mount time, so no GC zones 812 774 * should end up in the cache. 813 775 */ 814 776 ASSERT(!oz->oz_is_gc); 815 - ASSERT(atomic_read(&oz->oz_ref) > 0); 816 - atomic_inc(&oz->oz_ref); 777 + if (!atomic_inc_not_zero(&oz->oz_ref)) 778 + oz = NULL; 817 779 } 818 - xfs_mru_cache_done(mp->m_zone_cache); 780 + rcu_read_unlock(); 781 + 819 782 return oz; 820 783 } 821 784 822 785 /* 823 - * Update the last used zone cache for a given inode. 786 + * Stash our zone in the inode so that is is reused for future allocations. 824 787 * 825 - * The caller must have a reference on the open zone. 788 + * The open_zone structure will be pinned until either the inode is freed or 789 + * until the cached open zone is replaced with a different one because the 790 + * current one was full when we tried to use it. This means we keep any 791 + * open zone around forever as long as any inode that used it for the last 792 + * write is cached, which slightly increases the memory use of cached inodes 793 + * that were every written to, but significantly simplifies the cached zone 794 + * lookup. Because the open_zone is clearly marked as full when all data 795 + * in the underlying RTG was written, the caching is always safe. 
826 796 */ 827 797 static void 828 - xfs_zone_cache_create_association( 829 - struct xfs_inode *ip, 830 - struct xfs_open_zone *oz) 798 + xfs_set_cached_zone( 799 + struct xfs_inode *ip, 800 + struct xfs_open_zone *oz) 831 801 { 832 - struct xfs_mount *mp = ip->i_mount; 833 - struct xfs_zone_cache_item *item = NULL; 834 - struct xfs_mru_cache_elem *mru; 802 + struct xfs_open_zone *old_oz; 835 803 836 - ASSERT(atomic_read(&oz->oz_ref) > 0); 837 804 atomic_inc(&oz->oz_ref); 838 - 839 - mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); 840 - if (mru) { 841 - /* 842 - * If we have an association already, update it to point to the 843 - * new zone. 844 - */ 845 - item = xfs_zone_cache_item(mru); 846 - xfs_open_zone_put(item->oz); 847 - item->oz = oz; 848 - xfs_mru_cache_done(mp->m_zone_cache); 849 - return; 850 - } 851 - 852 - item = kmalloc(sizeof(*item), GFP_KERNEL); 853 - if (!item) { 854 - xfs_open_zone_put(oz); 855 - return; 856 - } 857 - item->oz = oz; 858 - xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); 805 + old_oz = xchg(&VFS_I(ip)->i_private, oz); 806 + if (old_oz) 807 + xfs_open_zone_put(old_oz); 859 808 } 860 809 861 810 static void ··· 856 891 * the inode is still associated with a zone and use that if so. 
857 892 */ 858 893 if (!*oz) 859 - *oz = xfs_cached_zone(mp, ip); 894 + *oz = xfs_get_cached_zone(ip); 860 895 861 896 if (!*oz) { 862 897 select_zone: 863 898 *oz = xfs_select_zone(mp, write_hint, pack_tight); 864 899 if (!*oz) 865 900 goto out_error; 866 - 867 - xfs_zone_cache_create_association(ip, *oz); 901 + xfs_set_cached_zone(ip, *oz); 868 902 } 869 903 870 904 alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), ··· 941 977 xfs_open_zone_put(oz); 942 978 } 943 979 spin_unlock(&zi->zi_open_zones_lock); 980 + 981 + /* 982 + * Wait for all open zones to be freed so that they drop the group 983 + * references: 984 + */ 985 + rcu_barrier(); 944 986 } 945 987 946 988 struct xfs_init_zones { ··· 1260 1290 error = xfs_zone_gc_mount(mp); 1261 1291 if (error) 1262 1292 goto out_free_zone_info; 1263 - 1264 - /* 1265 - * Set up a mru cache to track inode to open zone for data placement 1266 - * purposes. The magic values for group count and life time is the 1267 - * same as the defaults for file streams, which seems sane enough. 1268 - */ 1269 - xfs_mru_cache_create(&mp->m_zone_cache, mp, 1270 - 5000, 10, xfs_zone_cache_free_func); 1271 1293 return 0; 1272 1294 1273 1295 out_free_zone_info: ··· 1273 1311 { 1274 1312 xfs_zone_gc_unmount(mp); 1275 1313 xfs_free_zone_info(mp->m_zone_info); 1276 - xfs_mru_cache_destroy(mp->m_zone_cache); 1277 1314 }
+2
fs/xfs/xfs_zone_priv.h
··· 44 44 * the life time of an open zone. 45 45 */ 46 46 struct xfs_rtgroup *oz_rtg; 47 + 48 + struct rcu_head oz_rcu; 47 49 }; 48 50 49 51 /*