Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

NFSD: Implement NFSD_IO_DIRECT for NFS READ

Add an experimental option that forces NFS READ operations to use
direct I/O instead of reading through the NFS server's page cache.

There is already at least one other layer of read caching: the page
cache on NFS clients.

The server's page cache, in many cases, is unlikely to provide
additional benefit. Some benchmarks have demonstrated that the
server's page cache is actively detrimental for workloads whose
working set is larger than the server's available physical memory.

For instance, on small NFS servers, cached NFS file content can
squeeze out local memory consumers. For large sequential workloads,
an enormous amount of data flows into and out of the page cache
and is consumed by NFS clients exactly once -- caching that data
is expensive to do and totally valueless.

For now this is a hidden option that can be enabled on test
systems for benchmarking. In the longer term, this option might
be enabled persistently or per-export. When the exported file
system does not support direct I/O, NFSD falls back to using
either DONTCACHE or buffered I/O to fulfill NFS READ requests.

Suggested-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

+87
+2
fs/nfsd/debugfs.c
···
44 44    * Contents:
45 45    * %0: NFS READ will use buffered IO
46 46    * %1: NFS READ will use dontcache (buffered IO w/ dropbehind)
47 +     * %2: NFS READ will use direct IO
47 48    *
48 49    * This setting takes immediate effect for all NFS versions,
49 50    * all exports, and in all NFSD net namespaces.
···
65 64                   nfsd_io_cache_read = NFSD_IO_BUFFERED;
66 65                   break;
67 66           case NFSD_IO_DONTCACHE:
67 +             case NFSD_IO_DIRECT:
68 68                   /*
69 69                    * Must disable splice_read when enabling
70 70                    * NFSD_IO_DONTCACHE.
+1
fs/nfsd/nfsd.h
···
160 160         /* Any new NFSD_IO enum value must be added at the end */
161 161         NFSD_IO_BUFFERED,
162 162         NFSD_IO_DONTCACHE,
163 +           NFSD_IO_DIRECT,
163 164 };
164 165
165 166 extern u64 nfsd_io_cache_read __read_mostly;
+1
fs/nfsd/trace.h
···
464 464 DEFINE_NFSD_IO_EVENT(read_start);
465 465 DEFINE_NFSD_IO_EVENT(read_splice);
466 466 DEFINE_NFSD_IO_EVENT(read_vector);
467 +   DEFINE_NFSD_IO_EVENT(read_direct);
467 468 DEFINE_NFSD_IO_EVENT(read_io_done);
468 469 DEFINE_NFSD_IO_EVENT(read_done);
469 470 DEFINE_NFSD_IO_EVENT(write_start);
+83
fs/nfsd/vfs.c
···
1074 1074         return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
1075 1075 }
1076 1076
1077 +    /*
1078 +     * The byte range of the client's READ request is expanded on both ends
1079 +     * until it meets the underlying file system's direct I/O alignment
1080 +     * requirements. After the internal read is complete, the byte range of
1081 +     * the NFS READ payload is reduced to the byte range that was originally
1082 +     * requested.
1083 +     *
1084 +     * Note that a direct read can be done only when the xdr_buf containing
1085 +     * the NFS READ reply does not already have contents in its .pages array.
1086 +     * This is due to potentially restrictive alignment requirements on the
1087 +     * read buffer. When .page_len and @base are zero, the .pages array is
1088 +     * guaranteed to be page-aligned.
1089 +     */
1090 +    static noinline_for_stack __be32
1091 +    nfsd_direct_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
1092 +                     struct nfsd_file *nf, loff_t offset, unsigned long *count,
1093 +                     u32 *eof)
1094 +    {
1095 +            u64 dio_start, dio_end;
1096 +            unsigned long v, total;
1097 +            struct iov_iter iter;
1098 +            struct kiocb kiocb;
1099 +            ssize_t host_err;
1100 +            size_t len;
1101 +
1102 +            init_sync_kiocb(&kiocb, nf->nf_file);
1103 +            kiocb.ki_flags |= IOCB_DIRECT;
1104 +
1105 +            /* Read a properly-aligned region of bytes into rq_bvec */
1106 +            dio_start = round_down(offset, nf->nf_dio_read_offset_align);
1107 +            dio_end = round_up((u64)offset + *count, nf->nf_dio_read_offset_align);
1108 +
1109 +            kiocb.ki_pos = dio_start;
1110 +
1111 +            v = 0;
1112 +            total = dio_end - dio_start;
1113 +            while (total && v < rqstp->rq_maxpages &&
1114 +                   rqstp->rq_next_page < rqstp->rq_page_end) {
1115 +                    len = min_t(size_t, total, PAGE_SIZE);
1116 +                    bvec_set_page(&rqstp->rq_bvec[v], *rqstp->rq_next_page,
1117 +                                  len, 0);
1118 +
1119 +                    total -= len;
1120 +                    ++rqstp->rq_next_page;
1121 +                    ++v;
1122 +            }
1123 +
1124 +            trace_nfsd_read_direct(rqstp, fhp, offset, *count - total);
1125 +            iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v,
1126 +                          dio_end - dio_start - total);
1127 +
1128 +            host_err = vfs_iocb_iter_read(nf->nf_file, &kiocb, &iter);
1129 +            if (host_err >= 0) {
1130 +                    unsigned int pad = offset - dio_start;
1131 +
1132 +                    /* The returned payload starts after the pad */
1133 +                    rqstp->rq_res.page_base = pad;
1134 +
1135 +                    /* Compute the count of bytes to be returned */
1136 +                    if (host_err > pad + *count)
1137 +                            host_err = *count;
1138 +                    else if (host_err > pad)
1139 +                            host_err -= pad;
1140 +                    else
1141 +                            host_err = 0;
1142 +            } else if (unlikely(host_err == -EINVAL)) {
1143 +                    struct inode *inode = d_inode(fhp->fh_dentry);
1144 +
1145 +                    pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n",
1146 +                                        inode->i_sb->s_id, inode->i_ino);
1147 +                    host_err = -ESERVERFAULT;
1148 +            }
1149 +
1150 +            return nfsd_finish_read(rqstp, fhp, nf->nf_file, offset, count,
1151 +                                    eof, host_err);
1152 +    }
1153 +
1077 1154 /**
1078 1155  * nfsd_iter_read - Perform a VFS read using an iterator
1079 1156  * @rqstp: RPC transaction context
···
1183 1106         switch (nfsd_io_cache_read) {
1184 1107         case NFSD_IO_BUFFERED:
1185 1108                 break;
1109 +            case NFSD_IO_DIRECT:
1110 +                    /* When dio_read_offset_align is zero, dio is not supported */
1111 +                    if (nf->nf_dio_read_offset_align && !rqstp->rq_res.page_len)
1112 +                            return nfsd_direct_read(rqstp, fhp, nf, offset,
1113 +                                                    count, eof);
1114 +                    fallthrough;
1186 1115         case NFSD_IO_DONTCACHE:
1187 1116                 if (file->f_op->fop_flags & FOP_DONTCACHE)
1188 1117                         kiocb.ki_flags = IOCB_DONTCACHE;