Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

vfs: optimize inode cache access patterns

The inode structure layout is largely random, and some of the vfs paths
really do care about it. The path lookup in particular is already quite
data-cache (D$) intensive, and profiles show that accessing the
'inode->i_op->xyz' fields is quite costly.

We already optimized the dcache to avoid unnecessarily loading the d_op
structure for members that are often NULL, by using the DCACHE_OP_xyz
bits in dentry->d_flags. This patch does something very similar for the
inode ops that are used during pathname lookup.

It also re-orders the fields so that the fields accessed by 'stat' are
together at the beginning of the inode structure, and roughly in the
order accessed.

The effect of this seems to be in the 1-2% range for an empty kernel
"make -j" run (which is fairly kernel-intensive, mostly in filename
lookup), so it's visible. The numbers are fairly noisy, though, and
likely depend a lot on exact microarchitecture. So there's more tuning
to be done.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

+106 -34
+1
fs/inode.c
··· 143 143 inode->i_op = &empty_iops; 144 144 inode->i_fop = &empty_fops; 145 145 inode->i_nlink = 1; 146 + inode->i_opflags = 0; 146 147 inode->i_uid = 0; 147 148 inode->i_gid = 0; 148 149 atomic_set(&inode->i_writecount, 0);
+66 -10
fs/namei.c
··· 308 308 return -EACCES; 309 309 } 310 310 311 + /* 312 + * We _really_ want to just do "generic_permission()" without 313 + * even looking at the inode->i_op values. So we keep a cache 314 + * flag in inode->i_opflags, that says "this has not special 315 + * permission function, use the fast case". 316 + */ 317 + static inline int do_inode_permission(struct inode *inode, int mask) 318 + { 319 + if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { 320 + if (likely(inode->i_op->permission)) 321 + return inode->i_op->permission(inode, mask); 322 + 323 + /* This gets set once for the inode lifetime */ 324 + spin_lock(&inode->i_lock); 325 + inode->i_opflags |= IOP_FASTPERM; 326 + spin_unlock(&inode->i_lock); 327 + } 328 + return generic_permission(inode, mask); 329 + } 330 + 311 331 /** 312 332 * inode_permission - check for access rights to a given inode 313 333 * @inode: inode to check permission on ··· 342 322 { 343 323 int retval; 344 324 345 - if (mask & MAY_WRITE) { 325 + if (unlikely(mask & MAY_WRITE)) { 346 326 umode_t mode = inode->i_mode; 347 327 348 328 /* ··· 359 339 return -EACCES; 360 340 } 361 341 362 - if (inode->i_op->permission) 363 - retval = inode->i_op->permission(inode, mask); 364 - else 365 - retval = generic_permission(inode, mask); 366 - 342 + retval = do_inode_permission(inode, mask); 367 343 if (retval) 368 344 return retval; 369 345 ··· 1261 1245 } 1262 1246 } 1263 1247 1248 + /* 1249 + * Do we need to follow links? We _really_ want to be able 1250 + * to do this check without having to look at inode->i_op, 1251 + * so we keep a cache of "no, this doesn't need follow_link" 1252 + * for the common case. 
1253 + */ 1254 + static inline int do_follow_link(struct inode *inode, int follow) 1255 + { 1256 + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { 1257 + if (likely(inode->i_op->follow_link)) 1258 + return follow; 1259 + 1260 + /* This gets set once for the inode lifetime */ 1261 + spin_lock(&inode->i_lock); 1262 + inode->i_opflags |= IOP_NOFOLLOW; 1263 + spin_unlock(&inode->i_lock); 1264 + } 1265 + return 0; 1266 + } 1267 + 1264 1268 static inline int walk_component(struct nameidata *nd, struct path *path, 1265 1269 struct qstr *name, int type, int follow) 1266 1270 { ··· 1303 1267 terminate_walk(nd); 1304 1268 return -ENOENT; 1305 1269 } 1306 - if (unlikely(inode->i_op->follow_link) && follow) { 1270 + if (do_follow_link(inode, follow)) { 1307 1271 if (nd->flags & LOOKUP_RCU) { 1308 1272 if (unlikely(unlazy_walk(nd, path->dentry))) { 1309 1273 terminate_walk(nd); ··· 1353 1317 current->link_count--; 1354 1318 nd->depth--; 1355 1319 return res; 1320 + } 1321 + 1322 + /* 1323 + * We really don't want to look at inode->i_op->lookup 1324 + * when we don't have to. So we keep a cache bit in 1325 + * the inode ->i_opflags field that says "yes, we can 1326 + * do lookup on this inode". 1327 + */ 1328 + static inline int can_lookup(struct inode *inode) 1329 + { 1330 + if (likely(inode->i_opflags & IOP_LOOKUP)) 1331 + return 1; 1332 + if (likely(!inode->i_op->lookup)) 1333 + return 0; 1334 + 1335 + /* We do this once for the lifetime of the inode */ 1336 + spin_lock(&inode->i_lock); 1337 + inode->i_opflags |= IOP_LOOKUP; 1338 + spin_unlock(&inode->i_lock); 1339 + return 1; 1356 1340 } 1357 1341 1358 1342 /* ··· 1454 1398 if (err) 1455 1399 return err; 1456 1400 } 1401 + if (can_lookup(nd->inode)) 1402 + continue; 1457 1403 err = -ENOTDIR; 1458 - if (!nd->inode->i_op->lookup) 1459 - break; 1460 - continue; 1404 + break; 1461 1405 /* here ends the main loop */ 1462 1406 1463 1407 last_component:
+2 -2
fs/stat.c
··· 27 27 stat->uid = inode->i_uid; 28 28 stat->gid = inode->i_gid; 29 29 stat->rdev = inode->i_rdev; 30 + stat->size = i_size_read(inode); 30 31 stat->atime = inode->i_atime; 31 32 stat->mtime = inode->i_mtime; 32 33 stat->ctime = inode->i_ctime; 33 - stat->size = i_size_read(inode); 34 - stat->blocks = inode->i_blocks; 35 34 stat->blksize = (1 << inode->i_blkbits); 35 + stat->blocks = inode->i_blocks; 36 36 } 37 37 38 38 EXPORT_SYMBOL(generic_fillattr);
+37 -22
include/linux/fs.h
··· 738 738 struct posix_acl; 739 739 #define ACL_NOT_CACHED ((void *)(-1)) 740 740 741 + #define IOP_FASTPERM 0x0001 742 + #define IOP_LOOKUP 0x0002 743 + #define IOP_NOFOLLOW 0x0004 744 + 745 + /* 746 + * Keep mostly read-only and often accessed (especially for 747 + * the RCU path lookup and 'stat' data) fields at the beginning 748 + * of the 'struct inode' 749 + */ 741 750 struct inode { 742 - /* RCU path lookup touches following: */ 743 751 umode_t i_mode; 752 + unsigned short i_opflags; 744 753 uid_t i_uid; 745 754 gid_t i_gid; 755 + unsigned int i_flags; 756 + 757 + #ifdef CONFIG_FS_POSIX_ACL 758 + struct posix_acl *i_acl; 759 + struct posix_acl *i_default_acl; 760 + #endif 761 + 746 762 const struct inode_operations *i_op; 747 763 struct super_block *i_sb; 764 + struct address_space *i_mapping; 748 765 749 - spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 750 - unsigned int i_flags; 751 - unsigned long i_state; 752 766 #ifdef CONFIG_SECURITY 753 767 void *i_security; 754 768 #endif 755 - struct mutex i_mutex; 756 769 770 + /* Stat data, not accessed from path walking */ 771 + unsigned long i_ino; 772 + unsigned int i_nlink; 773 + dev_t i_rdev; 774 + loff_t i_size; 775 + struct timespec i_atime; 776 + struct timespec i_mtime; 777 + struct timespec i_ctime; 778 + unsigned int i_blkbits; 779 + blkcnt_t i_blocks; 780 + 781 + #ifdef __NEED_I_SIZE_ORDERED 782 + seqcount_t i_size_seqcount; 783 + #endif 784 + 785 + /* Misc */ 786 + unsigned long i_state; 787 + spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 788 + struct mutex i_mutex; 757 789 758 790 unsigned long dirtied_when; /* jiffies of first dirtying */ 759 791 ··· 797 765 struct list_head i_dentry; 798 766 struct rcu_head i_rcu; 799 767 }; 800 - unsigned long i_ino; 801 768 atomic_t i_count; 802 - unsigned int i_nlink; 803 - dev_t i_rdev; 804 - unsigned int i_blkbits; 805 769 u64 i_version; 806 - loff_t i_size; 807 - #ifdef __NEED_I_SIZE_ORDERED 808 - seqcount_t i_size_seqcount; 809 - 
#endif 810 - struct timespec i_atime; 811 - struct timespec i_mtime; 812 - struct timespec i_ctime; 813 - blkcnt_t i_blocks; 814 770 unsigned short i_bytes; 815 771 atomic_t i_dio_count; 816 772 const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ 817 773 struct file_lock *i_flock; 818 - struct address_space *i_mapping; 819 774 struct address_space i_data; 820 775 #ifdef CONFIG_QUOTA 821 776 struct dquot *i_dquot[MAXQUOTAS]; ··· 825 806 atomic_t i_readcount; /* struct files open RO */ 826 807 #endif 827 808 atomic_t i_writecount; 828 - #ifdef CONFIG_FS_POSIX_ACL 829 - struct posix_acl *i_acl; 830 - struct posix_acl *i_default_acl; 831 - #endif 832 809 void *i_private; /* fs or device private pointer */ 833 810 }; 834 811