Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'nfs-for-7.1-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

Pull NFS client updates from Trond Myklebust:
"Bugfixes:

- Fix handling of ENOSPC so that if we have to resend writes, they
are written synchronously

- SUNRPC RDMA transport fixes from Chuck

- Several fixes for delegated timestamps in NFSv4.2

- Failure to obtain a directory delegation should not cause stat() to
fail with NFSv4

- Rename was failing to update timestamps when a directory delegation
was held on NFSv4

- Ensure we check rsize/wsize after crossing an NFSv4 filesystem
boundary

- NFSv4/pnfs:

- If the server is down, retry the layout returns on reboot

- Fallback to MDS could result in a short write being incorrectly
logged

Cleanups:

- Use memcpy_and_pad in decode_fh"

* tag 'nfs-for-7.1-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (21 commits)
NFS: Fix RCU dereference of cl_xprt in nfs_compare_super_address
NFS: remove redundant __private attribute from nfs_page_class
NFSv4.2: fix CLONE/COPY attrs in presence of delegated attributes
NFS: fix writeback in presence of errors
nfs: use memcpy_and_pad in decode_fh
NFSv4.1: Apply session size limits on clone path
NFSv4: retry GETATTR if GET_DIR_DELEGATION failed
NFS: fix RENAME attr in presence of directory delegations
pnfs/flexfiles: validate ds_versions_cnt is non-zero
NFS/blocklayout: print each device used for SCSI layouts
xprtrdma: Post receive buffers after RPC completion
xprtrdma: Scale receive batch size with credit window
xprtrdma: Replace rpcrdma_mr_seg with xdr_buf cursor
xprtrdma: Decouple frwr_wp_create from frwr_map
xprtrdma: Close lost-wakeup race in xprt_rdma_alloc_slot
xprtrdma: Avoid 250 ms delay on backlog wakeup
xprtrdma: Close sendctx get/put race that can block a transport
nfs: update inode ctime after removexattr operation
nfs: fix utimensat() for atime with delegated timestamps
NFS: improve "Server wrote zero bytes" error
...

+442 -214
+5 -2
fs/nfs/blocklayout/dev.c
··· 370 370 if (!devname) 371 371 return ERR_PTR(-ENOMEM); 372 372 373 - bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, 374 - NULL, NULL); 373 + bdev_file = bdev_file_open_by_path(devname, 374 + BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); 375 375 if (IS_ERR(bdev_file)) { 376 376 dprintk("failed to open device %s (%ld)\n", 377 377 devname, PTR_ERR(bdev_file)); 378 + } else { 379 + pr_info("pNFS: using block device %s\n", 380 + file_bdev(bdev_file)->bd_disk->disk_name); 378 381 } 379 382 380 383 kfree(devname);
+1 -2
fs/nfs/callback_xdr.c
··· 96 96 p = xdr_inline_decode(xdr, fh->size); 97 97 if (unlikely(p == NULL)) 98 98 return htonl(NFS4ERR_RESOURCE); 99 - memcpy(&fh->data[0], p, fh->size); 100 - memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size); 99 + memcpy_and_pad(fh->data, sizeof(fh->data), p, fh->size, 0); 101 100 return 0; 102 101 } 103 102
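
For readers unfamiliar with the helper adopted here: memcpy_and_pad(dest, dest_len, src, count, pad) copies count bytes from src and fills the rest of the destination with the pad byte, replacing the open-coded memcpy()/memset() pair removed above. A minimal userspace sketch of the same semantics follows (illustrative only; the real helper lives in include/linux/string.h):

#include <stdio.h>
#include <string.h>

/* Mirrors the documented behaviour of the kernel's memcpy_and_pad():
 * copy count bytes, then pad the remainder of the destination. */
static void demo_memcpy_and_pad(void *dest, size_t dest_len,
                                const void *src, size_t count, int pad)
{
    if (dest_len > count) {
        memcpy(dest, src, count);
        memset((char *)dest + count, pad, dest_len - count);
    } else {
        memcpy(dest, src, dest_len);
    }
}

int main(void)
{
    unsigned char fh[16];
    const unsigned char wire[] = { 0xde, 0xad, 0xbe, 0xef };

    /* 4 bytes copied, the remaining 12 bytes zero-padded */
    demo_memcpy_and_pad(fh, sizeof(fh), wire, sizeof(wire), 0);

    for (size_t i = 0; i < sizeof(fh); i++)
        printf("%02x ", fh[i]);
    printf("\n");
    return 0;
}
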
+5
fs/nfs/flexfilelayout/flexfilelayoutdev.c
··· 97 97 if (unlikely(!p)) 98 98 goto out_err_drain_dsaddrs; 99 99 version_count = be32_to_cpup(p); 100 + 101 + if (version_count == 0) { 102 + ret = -EINVAL; 103 + goto out_err_drain_dsaddrs; 104 + } 100 105 dprintk("%s: version count %d\n", __func__, version_count); 101 106 102 107 ds_versions = kzalloc_objs(struct nfs4_ff_ds_version, version_count,
+3 -9
fs/nfs/inode.c
··· 692 692 693 693 void nfs_update_delegated_mtime_locked(struct inode *inode) 694 694 { 695 - if (nfs_have_delegated_mtime(inode)) 695 + if (nfs_have_delegated_mtime(inode) || 696 + nfs_have_directory_delegation(inode)) 696 697 nfs_update_mtime(inode); 697 698 } 698 699 ··· 758 757 } else if (nfs_have_delegated_atime(inode) && 759 758 attr->ia_valid & ATTR_ATIME && 760 759 !(attr->ia_valid & ATTR_MTIME)) { 761 - if (attr->ia_valid & ATTR_ATIME_SET) { 762 - if (uid_eq(task_uid, owner_uid)) { 763 - spin_lock(&inode->i_lock); 764 - nfs_set_timestamps_to_ts(inode, attr); 765 - spin_unlock(&inode->i_lock); 766 - attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); 767 - } 768 - } else { 760 + if (!(attr->ia_valid & ATTR_ATIME_SET)) { 769 761 nfs_update_delegated_atime(inode); 770 762 attr->ia_valid &= ~ATTR_ATIME; 771 763 }
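
The atime hunk above now applies only an "atime := now" update locally when an atime delegation is held; an explicit ATTR_ATIME_SET timestamp is no longer set on the client side in this path. From userspace both requests originate from the same syscall; a small POSIX illustration of the two cases follows (plain utimensat(2), nothing NFS-specific, and "testfile" is a placeholder path):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <time.h>

int main(void)
{
    struct timespec ts[2];    /* [0] = atime, [1] = mtime */

    /* Case 1: atime := "now" (touch -a); mtime left untouched.
     * A client holding an atime delegation can satisfy this locally. */
    ts[0].tv_nsec = UTIME_NOW;
    ts[1].tv_nsec = UTIME_OMIT;
    if (utimensat(AT_FDCWD, "testfile", ts, 0))
        perror("utimensat (UTIME_NOW)");

    /* Case 2: explicit atime (the ATTR_ATIME_SET path), which the fix
     * above leaves for a SETATTR to the server. */
    ts[0].tv_sec = 0;         /* 1970-01-01T00:00:00 */
    ts[0].tv_nsec = 0;
    ts[1].tv_nsec = UTIME_OMIT;
    if (utimensat(AT_FDCWD, "testfile", ts, 0))
        perror("utimensat (explicit atime)");
    return 0;
}
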
+2
fs/nfs/internal.h
··· 253 253 u32 minor_version); 254 254 extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, 255 255 struct inode *); 256 + extern void nfs4_session_limit_rwsize(struct nfs_server *server); 257 + extern void nfs4_session_limit_xasize(struct nfs_server *server); 256 258 extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, 257 259 const struct sockaddr_storage *ds_addr, int ds_addrlen, 258 260 int ds_proto, unsigned int ds_timeo,
+14 -1
fs/nfs/localio.c
··· 865 865 file_start_write(filp); 866 866 n_iters = atomic_read(&iocb->n_iters); 867 867 for (int i = 0; i < n_iters ; i++) { 868 + size_t icount; 869 + 868 870 if (iocb->iter_is_dio_aligned[i]) { 869 871 iocb->kiocb.ki_flags |= IOCB_DIRECT; 870 872 /* Only use AIO completion if DIO-aligned segment is last */ ··· 883 881 if (status == -EIOCBQUEUED) 884 882 continue; 885 883 /* Break on completion, errors, or short writes */ 884 + icount = iov_iter_count(&iocb->iters[i]); 886 885 if (nfs_local_pgio_done(iocb, status) || status < 0 || 887 - (size_t)status < iov_iter_count(&iocb->iters[i])) { 886 + (size_t)status < icount) { 887 + if ((size_t)status < icount) { 888 + struct nfs_lock_context *ctx = 889 + iocb->hdr->req->wb_lock_context; 890 + 891 + set_bit(NFS_CONTEXT_WRITE_SYNC, 892 + &ctx->open_context->flags); 893 + } 888 894 nfs_local_write_iocb_done(iocb); 889 895 break; 890 896 } ··· 911 901 __func__, hdr->args.count, hdr->args.offset, 912 902 (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); 913 903 904 + if (test_bit(NFS_CONTEXT_WRITE_SYNC, 905 + &hdr->req->wb_lock_context->open_context->flags)) 906 + hdr->args.stable = NFS_FILE_SYNC; 914 907 switch (hdr->args.stable) { 915 908 default: 916 909 break;
+17 -2
fs/nfs/nfs42proc.c
··· 401 401 NFS_INO_INVALID_MTIME | 402 402 NFS_INO_INVALID_BLOCKS); 403 403 spin_unlock(&inode->i_lock); 404 + nfs_update_delegated_mtime(inode); 404 405 } 405 406 406 407 static ssize_t _nfs42_proc_copy(struct file *src, ··· 1373 1372 static int _nfs42_proc_removexattr(struct inode *inode, const char *name) 1374 1373 { 1375 1374 struct nfs_server *server = NFS_SERVER(inode); 1375 + __u32 bitmask[NFS_BITMASK_SZ]; 1376 1376 struct nfs42_removexattrargs args = { 1377 1377 .fh = NFS_FH(inode), 1378 + .bitmask = bitmask, 1378 1379 .xattr_name = name, 1379 1380 }; 1380 - struct nfs42_removexattrres res; 1381 + struct nfs42_removexattrres res = { 1382 + .server = server, 1383 + }; 1381 1384 struct rpc_message msg = { 1382 1385 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVEXATTR], 1383 1386 .rpc_argp = &args, ··· 1390 1385 int ret; 1391 1386 unsigned long timestamp = jiffies; 1392 1387 1388 + res.fattr = nfs_alloc_fattr(); 1389 + if (!res.fattr) 1390 + return -ENOMEM; 1391 + 1392 + nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, 1393 + inode, NFS_INO_INVALID_CHANGE); 1394 + 1393 1395 ret = nfs4_call_sync(server->client, server, &msg, &args.seq_args, 1394 1396 &res.seq_res, 1); 1395 1397 trace_nfs4_removexattr(inode, name, ret); 1396 - if (!ret) 1398 + if (!ret) { 1397 1399 nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); 1400 + ret = nfs_post_op_update_inode(inode, res.fattr); 1401 + } 1398 1402 1403 + kfree(res.fattr); 1399 1404 return ret; 1400 1405 } 1401 1406
+8 -2
fs/nfs/nfs42xdr.c
··· 263 263 #define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \ 264 264 encode_sequence_maxsz + \ 265 265 encode_putfh_maxsz + \ 266 - encode_removexattr_maxsz) 266 + encode_removexattr_maxsz + \ 267 + encode_getattr_maxsz) 267 268 #define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \ 268 269 decode_sequence_maxsz + \ 269 270 decode_putfh_maxsz + \ 270 - decode_removexattr_maxsz) 271 + decode_removexattr_maxsz + \ 272 + decode_getattr_maxsz) 271 273 272 274 /* 273 275 * These values specify the maximum amount of data that is not ··· 871 869 encode_sequence(xdr, &args->seq_args, &hdr); 872 870 encode_putfh(xdr, args->fh, &hdr); 873 871 encode_removexattr(xdr, args->xattr_name, &hdr); 872 + encode_getfattr(xdr, args->bitmask, &hdr); 874 873 encode_nops(&hdr); 875 874 } 876 875 ··· 1821 1818 goto out; 1822 1819 1823 1820 status = decode_removexattr(xdr, &res->cinfo); 1821 + if (status) 1822 + goto out; 1823 + status = decode_getfattr(xdr, res->fattr, res->server); 1824 1824 out: 1825 1825 return status; 1826 1826 }
+2 -2
fs/nfs/nfs4client.c
··· 855 855 * Limit the mount rsize, wsize and dtsize using negotiated fore 856 856 * channel attributes. 857 857 */ 858 - static void nfs4_session_limit_rwsize(struct nfs_server *server) 858 + void nfs4_session_limit_rwsize(struct nfs_server *server) 859 859 { 860 860 struct nfs4_session *sess; 861 861 u32 server_resp_sz; ··· 878 878 /* 879 879 * Limit xattr sizes using the channel attributes. 880 880 */ 881 - static void nfs4_session_limit_xasize(struct nfs_server *server) 881 + void nfs4_session_limit_xasize(struct nfs_server *server) 882 882 { 883 883 #ifdef CONFIG_NFS_V4_2 884 884 struct nfs4_session *sess;
+32 -10
fs/nfs/nfs4proc.c
··· 4469 4469 case -ENOTSUPP: 4470 4470 case -EOPNOTSUPP: 4471 4471 server->caps &= ~NFS_CAP_DIR_DELEG; 4472 + break; 4473 + case -NFS4ERR_INVAL: 4474 + case -NFS4ERR_IO: 4475 + case -NFS4ERR_DIRDELEG_UNAVAIL: 4476 + case -NFS4ERR_NOTDIR: 4477 + clear_bit(NFS_INO_REQ_DIR_DELEG, &(NFS_I(inode)->flags)); 4478 + status = -EAGAIN; 4472 4479 } 4473 4480 } 4474 4481 ··· 4497 4490 default: 4498 4491 err = nfs4_handle_exception(server, err, &exception); 4499 4492 break; 4493 + case -EAGAIN: 4500 4494 case -ENOTSUPP: 4501 4495 case -EOPNOTSUPP: 4502 4496 exception.retry = true; ··· 5060 5052 res->new_fattr->time_start, 5061 5053 NFS_INO_INVALID_NLINK | 5062 5054 NFS_INO_INVALID_DATA); 5055 + nfs_update_delegated_mtime(new_dir); 5063 5056 } else 5064 5057 nfs4_update_changeattr(old_dir, &res->old_cinfo, 5065 5058 res->old_fattr->time_start, ··· 9778 9769 if (!nfs41_sequence_process(task, &lrp->res.seq_res)) 9779 9770 return; 9780 9771 9781 - if (task->tk_rpc_status == -ETIMEDOUT) { 9782 - lrp->rpc_status = -EAGAIN; 9783 - lrp->res.lrs_present = 0; 9784 - return; 9785 - } 9786 - /* 9787 - * Was there an RPC level error? Assume the call succeeded, 9788 - * and that we need to release the layout 9789 - */ 9790 - if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) { 9772 + if (task->tk_rpc_status < 0) { 9773 + switch (task->tk_rpc_status) { 9774 + case -EACCES: 9775 + case -EIO: 9776 + case -EKEYEXPIRED: 9777 + case -ERESTARTSYS: 9778 + case -EINTR: 9779 + lrp->rpc_status = 0; 9780 + break; 9781 + case -ENETDOWN: 9782 + case -ENETUNREACH: 9783 + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) 9784 + lrp->rpc_status = 0; 9785 + else 9786 + lrp->rpc_status = -EAGAIN; 9787 + break; 9788 + default: 9789 + lrp->rpc_status = -EAGAIN; 9790 + break; 9791 + } 9791 9792 lrp->res.lrs_present = 0; 9792 9793 return; 9793 9794 } ··· 10636 10617 server = nfs_clone_server(source, fh, fattr, flavor); 10637 10618 if (IS_ERR(server)) 10638 10619 return server; 10620 + 10621 + nfs4_session_limit_rwsize(server); 10622 + nfs4_session_limit_xasize(server); 10639 10623 10640 10624 error = nfs4_delegation_hash_alloc(server); 10641 10625 if (error) {
+1 -1
fs/nfs/nfstrace.h
··· 1496 1496 __field(dev_t, dev) 1497 1497 __field(u32, fhandle) 1498 1498 __field(u64, fileid) 1499 - __field(const struct nfs_page *__private, req) 1499 + __field(const struct nfs_page *, req) 1500 1500 __field(loff_t, offset) 1501 1501 __field(unsigned int, count) 1502 1502 __field(unsigned long, flags)
+3
fs/nfs/pagelist.c
··· 1186 1186 1187 1187 nfs_page_group_lock(req); 1188 1188 1189 + if (test_bit(NFS_CONTEXT_WRITE_SYNC, 1190 + &req->wb_lock_context->open_context->flags)) 1191 + desc->pg_ioflags |= FLUSH_STABLE; 1189 1192 subreq = req; 1190 1193 subreq_size = subreq->wb_bytes; 1191 1194 for(;;) {
+17 -5
fs/nfs/pnfs.c
··· 1698 1698 /* If the call was not sent, let caller handle it */ 1699 1699 if (!RPC_WAS_SENT(task)) 1700 1700 return 0; 1701 - /* 1702 - * Otherwise, assume the call succeeded and 1703 - * that we need to release the layout 1704 - */ 1705 - *ret = 0; 1701 + switch (task->tk_rpc_status) { 1702 + default: 1703 + /* 1704 + * Defer the layoutreturn if it was due 1705 + * to the server being down. 1706 + */ 1707 + *ret = -NFS4ERR_NOMATCHING_LAYOUT; 1708 + break; 1709 + case -EACCES: 1710 + case -EIO: 1711 + case -EKEYEXPIRED: 1712 + case -ERESTARTSYS: 1713 + case -EINTR: 1714 + /* Don't retry */ 1715 + *ret = 0; 1716 + break; 1717 + } 1706 1718 (*respp)->lrs_present = 0; 1707 1719 retval = 0; 1708 1720 break;
+13 -3
fs/nfs/super.c
··· 1166 1166 static int nfs_compare_super_address(struct nfs_server *server1, 1167 1167 struct nfs_server *server2) 1168 1168 { 1169 + struct rpc_xprt *xprt1, *xprt2; 1169 1170 struct sockaddr *sap1, *sap2; 1170 - struct rpc_xprt *xprt1 = server1->client->cl_xprt; 1171 - struct rpc_xprt *xprt2 = server2->client->cl_xprt; 1171 + 1172 + rcu_read_lock(); 1173 + 1174 + xprt1 = rcu_dereference(server1->client->cl_xprt); 1175 + xprt2 = rcu_dereference(server2->client->cl_xprt); 1172 1176 1173 1177 if (!net_eq(xprt1->xprt_net, xprt2->xprt_net)) 1174 - return 0; 1178 + goto out_unlock; 1179 + 1180 + rcu_read_unlock(); 1175 1181 1176 1182 sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; 1177 1183 sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; ··· 1209 1203 } 1210 1204 1211 1205 return 1; 1206 + 1207 + out_unlock: 1208 + rcu_read_unlock(); 1209 + return 0; 1212 1210 } 1213 1211 1214 1212 static int nfs_compare_userns(const struct nfs_server *old,
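
The hunk above wraps the cl_xprt lookups in an RCU read-side critical section, since the transport pointer can be replaced while the superblock comparison runs. The same read-side discipline exists in userspace through the liburcu library; the sketch below is an assumption-laden illustration of that pattern (header name, flavor, and registration requirements differ across liburcu versions), not kernel code:

#include <stdio.h>
#include <urcu.h>        /* userspace RCU, default flavor; link with -lurcu */

struct settings {
    int timeout;
};

/* RCU-protected pointer: readers must use rcu_dereference() inside an
 * rcu_read_lock()/rcu_read_unlock() section, just like cl_xprt above. */
static struct settings *current_settings;

static int read_timeout(void)
{
    struct settings *s;
    int t;

    rcu_read_lock();
    s = rcu_dereference(current_settings);
    t = s ? s->timeout : -1;
    rcu_read_unlock();
    return t;
}

int main(void)
{
    static struct settings initial = { .timeout = 30 };

    rcu_register_thread();                /* each reader thread registers */
    rcu_assign_pointer(current_settings, &initial);
    printf("timeout=%d\n", read_timeout());
    rcu_unregister_thread();
    return 0;
}
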
+10 -1
fs/nfs/write.c
··· 926 926 goto remove_req; 927 927 } 928 928 if (nfs_write_need_commit(hdr)) { 929 + struct nfs_open_context *ctx = 930 + hdr->req->wb_lock_context->open_context; 931 + 929 932 /* Reset wb_nio, since the write was successful. */ 930 933 req->wb_nio = 0; 931 934 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); 935 + clear_bit(NFS_CONTEXT_WRITE_SYNC, &ctx->flags); 932 936 nfs_mark_request_commit(req, hdr->lseg, &cinfo, 933 937 hdr->ds_commit_idx); 934 938 goto next; ··· 1554 1550 struct nfs_pgio_args *argp = &hdr->args; 1555 1551 struct nfs_pgio_res *resp = &hdr->res; 1556 1552 1557 - if (resp->count < argp->count) { 1553 + if (resp->count < argp->count && !list_empty(&hdr->pages)) { 1558 1554 static unsigned long complain; 1555 + struct nfs_open_context *ctx = 1556 + hdr->req->wb_lock_context->open_context; 1559 1557 1558 + set_bit(NFS_CONTEXT_WRITE_SYNC, &ctx->flags); 1560 1559 /* This a short write! */ 1561 1560 nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE); 1562 1561 ··· 1843 1836 /* We have a mismatch. Write the page again */ 1844 1837 dprintk(" mismatch\n"); 1845 1838 nfs_mark_request_dirty(req); 1839 + set_bit(NFS_CONTEXT_WRITE_SYNC, 1840 + &req->wb_lock_context->open_context->flags); 1846 1841 atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); 1847 1842 next: 1848 1843 nfs_unlock_and_release_request(req);
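
The resp->count < argp->count test above is the RPC-level form of a short write; the fix reacts by flagging the open context so the resend is issued as a stable write rather than silently requeued. For comparison, the classic userspace response to a short write(2) is simply to resend the remainder; an illustrative analogue (not the NFS code path):

#include <errno.h>
#include <unistd.h>

/* Write the whole buffer, resending after any short write. Returns 0 on
 * success, -1 on error (errno set, e.g. ENOSPC). */
static int write_all(int fd, const void *buf, size_t count)
{
    const char *p = buf;
    size_t left = count;

    while (left > 0) {
        ssize_t n = write(fd, p, left);

        if (n < 0) {
            if (errno == EINTR)
                continue;    /* retry the same chunk */
            return -1;
        }
        /* n < left is a short write: advance and resend the rest */
        p += n;
        left -= (size_t)n;
    }
    return 0;
}

int main(void)
{
    static const char msg[] = "short-write demo\n";

    return write_all(STDOUT_FILENO, msg, sizeof(msg) - 1) ? 1 : 0;
}
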
+1
include/linux/nfs_fs.h
··· 109 109 #define NFS_CONTEXT_BAD (2) 110 110 #define NFS_CONTEXT_UNLOCK (3) 111 111 #define NFS_CONTEXT_FILE_OPEN (4) 112 + #define NFS_CONTEXT_WRITE_SYNC (5) 112 113 113 114 struct nfs4_threshold *mdsthreshold; 114 115 struct list_head list;
+3
include/linux/nfs_xdr.h
··· 1611 1611 struct nfs42_removexattrargs { 1612 1612 struct nfs4_sequence_args seq_args; 1613 1613 struct nfs_fh *fh; 1614 + const u32 *bitmask; 1614 1615 const char *xattr_name; 1615 1616 }; 1616 1617 1617 1618 struct nfs42_removexattrres { 1618 1619 struct nfs4_sequence_res seq_res; 1619 1620 struct nfs4_change_info cinfo; 1621 + struct nfs_fattr *fattr; 1622 + const struct nfs_server *server; 1620 1623 }; 1621 1624 1622 1625 #endif /* CONFIG_NFS_V4_2 */
+2
include/linux/sunrpc/xprt.h
··· 404 404 unsigned int max_req); 405 405 void xprt_free(struct rpc_xprt *); 406 406 void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task); 407 + void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, 408 + struct rpc_task *task); 407 409 bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req); 408 410 void xprt_cleanup_ids(void); 409 411
+14 -14
include/trace/events/rpcrdma.h
··· 392 392 const struct rpc_task *task, 393 393 unsigned int pos, 394 394 struct rpcrdma_mr *mr, 395 - int nsegs 395 + bool is_last 396 396 ), 397 397 398 - TP_ARGS(task, pos, mr, nsegs), 398 + TP_ARGS(task, pos, mr, is_last), 399 399 400 400 TP_STRUCT__entry( 401 401 __field(unsigned int, task_id) ··· 405 405 __field(u32, handle) 406 406 __field(u32, length) 407 407 __field(u64, offset) 408 - __field(int, nsegs) 408 + __field(bool, is_last) 409 409 ), 410 410 411 411 TP_fast_assign( ··· 416 416 __entry->handle = mr->mr_handle; 417 417 __entry->length = mr->mr_length; 418 418 __entry->offset = mr->mr_offset; 419 - __entry->nsegs = nsegs; 419 + __entry->is_last = is_last; 420 420 ), 421 421 422 422 TP_printk(SUNRPC_TRACE_TASK_SPECIFIER ··· 424 424 __entry->task_id, __entry->client_id, 425 425 __entry->pos, __entry->length, 426 426 (unsigned long long)__entry->offset, __entry->handle, 427 - __entry->nents < __entry->nsegs ? "more" : "last" 427 + __entry->is_last ? "last" : "more" 428 428 ) 429 429 ); 430 430 ··· 434 434 const struct rpc_task *task, \ 435 435 unsigned int pos, \ 436 436 struct rpcrdma_mr *mr, \ 437 - int nsegs \ 437 + bool is_last \ 438 438 ), \ 439 - TP_ARGS(task, pos, mr, nsegs)) 439 + TP_ARGS(task, pos, mr, is_last)) 440 440 441 441 DECLARE_EVENT_CLASS(xprtrdma_wrch_event, 442 442 TP_PROTO( 443 443 const struct rpc_task *task, 444 444 struct rpcrdma_mr *mr, 445 - int nsegs 445 + bool is_last 446 446 ), 447 447 448 - TP_ARGS(task, mr, nsegs), 448 + TP_ARGS(task, mr, is_last), 449 449 450 450 TP_STRUCT__entry( 451 451 __field(unsigned int, task_id) ··· 454 454 __field(u32, handle) 455 455 __field(u32, length) 456 456 __field(u64, offset) 457 - __field(int, nsegs) 457 + __field(bool, is_last) 458 458 ), 459 459 460 460 TP_fast_assign( ··· 464 464 __entry->handle = mr->mr_handle; 465 465 __entry->length = mr->mr_length; 466 466 __entry->offset = mr->mr_offset; 467 - __entry->nsegs = nsegs; 467 + __entry->is_last = is_last; 468 468 ), 469 469 470 470 TP_printk(SUNRPC_TRACE_TASK_SPECIFIER ··· 472 472 __entry->task_id, __entry->client_id, 473 473 __entry->length, (unsigned long long)__entry->offset, 474 474 __entry->handle, 475 - __entry->nents < __entry->nsegs ? "more" : "last" 475 + __entry->is_last ? "last" : "more" 476 476 ) 477 477 ); 478 478 ··· 481 481 TP_PROTO( \ 482 482 const struct rpc_task *task, \ 483 483 struct rpcrdma_mr *mr, \ 484 - int nsegs \ 484 + bool is_last \ 485 485 ), \ 486 - TP_ARGS(task, mr, nsegs)) 486 + TP_ARGS(task, mr, is_last)) 487 487 488 488 TRACE_DEFINE_ENUM(DMA_BIDIRECTIONAL); 489 489 TRACE_DEFINE_ENUM(DMA_TO_DEVICE);
+16
net/sunrpc/xprt.c
··· 1663 1663 } 1664 1664 EXPORT_SYMBOL_GPL(xprt_add_backlog); 1665 1665 1666 + /** 1667 + * xprt_add_backlog_noncongested - queue task on backlog 1668 + * @xprt: transport whose backlog queue receives the task 1669 + * @task: task to queue 1670 + * 1671 + * Like xprt_add_backlog, but does not set XPRT_CONGESTED. 1672 + * For transports whose free_slot path does not synchronize 1673 + * with xprt_throttle_congested via reserve_lock. 1674 + */ 1675 + void xprt_add_backlog_noncongested(struct rpc_xprt *xprt, 1676 + struct rpc_task *task) 1677 + { 1678 + rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init); 1679 + } 1680 + EXPORT_SYMBOL_GPL(xprt_add_backlog_noncongested); 1681 + 1666 1682 static bool __xprt_set_rq(struct rpc_task *task, void *data) 1667 1683 { 1668 1684 struct rpc_rqst *req = data;
+145 -32
net/sunrpc/xprtrdma/frwr_ops.c
··· 244 244 } 245 245 ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; 246 246 ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ 247 + ep->re_recv_batch = ep->re_max_requests >> 2; 247 248 ep->re_attr.cap.max_recv_wr = ep->re_max_requests; 248 249 ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; 249 - ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH; 250 + ep->re_attr.cap.max_recv_wr += ep->re_recv_batch; 250 251 ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ 251 252 252 253 ep->re_max_rdma_segs = ··· 269 268 } 270 269 271 270 /** 272 - * frwr_map - Register a memory region 271 + * frwr_map - Register a memory region from an xdr_buf cursor 273 272 * @r_xprt: controlling transport 274 - * @seg: memory region co-ordinates 275 - * @nsegs: number of segments remaining 273 + * @cur: cursor tracking position within the xdr_buf 276 274 * @writing: true when RDMA Write will be used 277 275 * @xid: XID of RPC using the registered memory 278 276 * @mr: MR to fill in ··· 279 279 * Prepare a REG_MR Work Request to register a memory region 280 280 * for remote access via RDMA READ or RDMA WRITE. 281 281 * 282 - * Returns the next segment or a negative errno pointer. 283 - * On success, @mr is filled in. 282 + * Returns 0 on success (cursor advanced past consumed data, 283 + * @mr populated) or a negative errno on failure. 284 284 */ 285 - struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, 286 - struct rpcrdma_mr_seg *seg, 287 - int nsegs, bool writing, __be32 xid, 288 - struct rpcrdma_mr *mr) 285 + int frwr_map(struct rpcrdma_xprt *r_xprt, 286 + struct rpcrdma_xdr_cursor *cur, 287 + bool writing, __be32 xid, 288 + struct rpcrdma_mr *mr) 289 289 { 290 290 struct rpcrdma_ep *ep = r_xprt->rx_ep; 291 + const struct xdr_buf *xdrbuf = cur->xc_buf; 292 + bool sg_gaps = ep->re_mrtype == IB_MR_TYPE_SG_GAPS; 293 + unsigned int max_depth = ep->re_max_fr_depth; 291 294 struct ib_reg_wr *reg_wr; 292 295 int i, n, dma_nents; 293 296 struct ib_mr *ibmr; 294 297 u8 key; 295 298 296 - if (nsegs > ep->re_max_fr_depth) 297 - nsegs = ep->re_max_fr_depth; 298 - for (i = 0; i < nsegs;) { 299 - sg_set_page(&mr->mr_sg[i], seg->mr_page, 300 - seg->mr_len, seg->mr_offset); 299 + i = 0; 301 300 302 - ++seg; 303 - ++i; 304 - if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) 305 - continue; 306 - if ((i < nsegs && seg->mr_offset) || 307 - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 308 - break; 301 + /* Head kvec */ 302 + if (!(cur->xc_flags & XC_HEAD_DONE)) { 303 + const struct kvec *head = &xdrbuf->head[0]; 304 + 305 + sg_set_page(&mr->mr_sg[i], 306 + virt_to_page(head->iov_base), 307 + head->iov_len, 308 + offset_in_page(head->iov_base)); 309 + cur->xc_flags |= XC_HEAD_DONE; 310 + i++; 311 + /* Without sg-gap support, each non-contiguous region 312 + * must be registered as a separate MR. Returning 313 + * here after the head kvec causes the caller to 314 + * invoke frwr_map() again for the page list and 315 + * tail. 
316 + */ 317 + if (!sg_gaps) 318 + goto finish; 309 319 } 320 + 321 + /* Page list */ 322 + if (!(cur->xc_flags & XC_PAGES_DONE) && xdrbuf->page_len) { 323 + unsigned int page_base, remaining; 324 + struct page **ppages; 325 + 326 + remaining = xdrbuf->page_len - cur->xc_page_offset; 327 + page_base = offset_in_page(xdrbuf->page_base + 328 + cur->xc_page_offset); 329 + ppages = xdrbuf->pages + 330 + ((xdrbuf->page_base + cur->xc_page_offset) 331 + >> PAGE_SHIFT); 332 + 333 + while (remaining > 0 && i < max_depth) { 334 + unsigned int len; 335 + 336 + len = min_t(unsigned int, 337 + PAGE_SIZE - page_base, remaining); 338 + sg_set_page(&mr->mr_sg[i], *ppages, 339 + len, page_base); 340 + cur->xc_page_offset += len; 341 + i++; 342 + ppages++; 343 + remaining -= len; 344 + 345 + if (!sg_gaps && remaining > 0 && 346 + offset_in_page(page_base + len)) 347 + goto finish; 348 + page_base = 0; 349 + } 350 + if (remaining == 0) 351 + cur->xc_flags |= XC_PAGES_DONE; 352 + } else if (!(cur->xc_flags & XC_PAGES_DONE)) { 353 + cur->xc_flags |= XC_PAGES_DONE; 354 + } 355 + 356 + /* Tail kvec */ 357 + if (!(cur->xc_flags & XC_TAIL_DONE) && xdrbuf->tail[0].iov_len && 358 + i < max_depth) { 359 + const struct kvec *tail = &xdrbuf->tail[0]; 360 + 361 + if (!sg_gaps && i > 0) { 362 + struct scatterlist *prev = &mr->mr_sg[i - 1]; 363 + 364 + if (offset_in_page(prev->offset + prev->length) || 365 + offset_in_page(tail->iov_base)) 366 + goto finish; 367 + } 368 + sg_set_page(&mr->mr_sg[i], 369 + virt_to_page(tail->iov_base), 370 + tail->iov_len, 371 + offset_in_page(tail->iov_base)); 372 + cur->xc_flags |= XC_TAIL_DONE; 373 + i++; 374 + } else if (!(cur->xc_flags & XC_TAIL_DONE) && 375 + !xdrbuf->tail[0].iov_len) { 376 + cur->xc_flags |= XC_TAIL_DONE; 377 + } 378 + 379 + finish: 310 380 mr->mr_dir = rpcrdma_data_dir(writing); 311 381 mr->mr_nents = i; 312 382 ··· 408 338 mr->mr_offset = ibmr->iova; 409 339 trace_xprtrdma_mr_map(mr); 410 340 411 - return seg; 341 + return 0; 412 342 413 343 out_dmamap_err: 414 344 trace_xprtrdma_frwr_sgerr(mr, i); 415 - return ERR_PTR(-EIO); 345 + return -EIO; 416 346 417 347 out_mapmr_err: 418 348 trace_xprtrdma_frwr_maperr(mr, n); 419 - return ERR_PTR(-EIO); 349 + return -EIO; 420 350 } 421 351 422 352 /** ··· 739 669 */ 740 670 int frwr_wp_create(struct rpcrdma_xprt *r_xprt) 741 671 { 672 + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 742 673 struct rpcrdma_ep *ep = r_xprt->rx_ep; 743 - struct rpcrdma_mr_seg seg; 674 + struct ib_reg_wr *reg_wr; 744 675 struct rpcrdma_mr *mr; 676 + struct ib_mr *ibmr; 677 + int dma_nents; 678 + int ret; 745 679 746 680 mr = rpcrdma_mr_get(r_xprt); 747 681 if (!mr) ··· 753 679 mr->mr_req = NULL; 754 680 ep->re_write_pad_mr = mr; 755 681 756 - seg.mr_len = XDR_UNIT; 757 - seg.mr_page = virt_to_page(ep->re_write_pad); 758 - seg.mr_offset = offset_in_page(ep->re_write_pad); 759 - if (IS_ERR(frwr_map(r_xprt, &seg, 1, true, xdr_zero, mr))) 760 - return -EIO; 682 + sg_init_table(mr->mr_sg, 1); 683 + sg_set_page(mr->mr_sg, virt_to_page(ep->re_write_pad), 684 + XDR_UNIT, offset_in_page(ep->re_write_pad)); 685 + 686 + mr->mr_dir = DMA_FROM_DEVICE; 687 + mr->mr_nents = 1; 688 + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, 689 + mr->mr_nents, mr->mr_dir); 690 + if (!dma_nents) { 691 + ret = -EIO; 692 + goto out_mr; 693 + } 694 + mr->mr_device = ep->re_id->device; 695 + 696 + ibmr = mr->mr_ibmr; 697 + if (ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, 698 + PAGE_SIZE) != dma_nents) { 699 + ret = -EIO; 700 + goto out_unmap; 701 + } 702 + 703 + /* 
IOVA is not tagged with an XID; the write-pad is not RPC-specific. */ 704 + ib_update_fast_reg_key(ibmr, ib_inc_rkey(ibmr->rkey)); 705 + 706 + reg_wr = &mr->mr_regwr; 707 + reg_wr->mr = ibmr; 708 + reg_wr->key = ibmr->rkey; 709 + reg_wr->access = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE; 710 + 711 + mr->mr_handle = ibmr->rkey; 712 + mr->mr_length = ibmr->length; 713 + mr->mr_offset = ibmr->iova; 714 + 761 715 trace_xprtrdma_mr_fastreg(mr); 762 716 763 717 mr->mr_cqe.done = frwr_wc_fastreg; ··· 795 693 mr->mr_regwr.wr.opcode = IB_WR_REG_MR; 796 694 mr->mr_regwr.wr.send_flags = 0; 797 695 798 - return ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); 696 + ret = ib_post_send(ep->re_id->qp, &mr->mr_regwr.wr, NULL); 697 + if (!ret) 698 + return 0; 699 + 700 + out_unmap: 701 + frwr_mr_unmap(mr); 702 + out_mr: 703 + ep->re_write_pad_mr = NULL; 704 + spin_lock(&buf->rb_lock); 705 + rpcrdma_mr_push(mr, &buf->rb_mrs); 706 + spin_unlock(&buf->rb_lock); 707 + return ret; 799 708 }
+66 -111
net/sunrpc/xprtrdma/rpc_rdma.c
··· 200 200 return 0; 201 201 } 202 202 203 - /* Convert @vec to a single SGL element. 204 - * 205 - * Returns pointer to next available SGE, and bumps the total number 206 - * of SGEs consumed. 207 - */ 208 - static struct rpcrdma_mr_seg * 209 - rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 210 - unsigned int *n) 203 + static void 204 + rpcrdma_xdr_cursor_init(struct rpcrdma_xdr_cursor *cur, 205 + const struct xdr_buf *xdrbuf, 206 + unsigned int pos, enum rpcrdma_chunktype type) 211 207 { 212 - seg->mr_page = virt_to_page(vec->iov_base); 213 - seg->mr_offset = offset_in_page(vec->iov_base); 214 - seg->mr_len = vec->iov_len; 215 - ++seg; 216 - ++(*n); 217 - return seg; 208 + cur->xc_buf = xdrbuf; 209 + cur->xc_page_offset = 0; 210 + cur->xc_flags = 0; 211 + 212 + if (pos != 0) 213 + cur->xc_flags |= XC_HEAD_DONE; 214 + if (!xdrbuf->page_len) 215 + cur->xc_flags |= XC_PAGES_DONE; 216 + if (type == rpcrdma_readch || type == rpcrdma_writech || 217 + !xdrbuf->tail[0].iov_len) 218 + cur->xc_flags |= XC_TAIL_DONE; 218 219 } 219 220 220 - /* Convert @xdrbuf into SGEs no larger than a page each. As they 221 - * are registered, these SGEs are then coalesced into RDMA segments 222 - * when the selected memreg mode supports it. 223 - * 224 - * Returns positive number of SGEs consumed, or a negative errno. 225 - */ 226 - 227 - static int 228 - rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, 229 - unsigned int pos, enum rpcrdma_chunktype type, 230 - struct rpcrdma_mr_seg *seg) 221 + static bool 222 + rpcrdma_xdr_cursor_done(const struct rpcrdma_xdr_cursor *cur) 231 223 { 232 - unsigned long page_base; 233 - unsigned int len, n; 234 - struct page **ppages; 235 - 236 - n = 0; 237 - if (pos == 0) 238 - seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); 239 - 240 - len = xdrbuf->page_len; 241 - ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 242 - page_base = offset_in_page(xdrbuf->page_base); 243 - while (len) { 244 - seg->mr_page = *ppages; 245 - seg->mr_offset = page_base; 246 - seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); 247 - len -= seg->mr_len; 248 - ++ppages; 249 - ++seg; 250 - ++n; 251 - page_base = 0; 252 - } 253 - 254 - if (type == rpcrdma_readch || type == rpcrdma_writech) 255 - goto out; 256 - 257 - if (xdrbuf->tail[0].iov_len) 258 - rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); 259 - 260 - out: 261 - if (unlikely(n > RPCRDMA_MAX_SEGS)) 262 - return -EIO; 263 - return n; 224 + return (cur->xc_flags & (XC_HEAD_DONE | XC_PAGES_DONE | 225 + XC_TAIL_DONE)) == 226 + (XC_HEAD_DONE | XC_PAGES_DONE | XC_TAIL_DONE); 264 227 } 265 228 266 229 static int ··· 255 292 return 0; 256 293 } 257 294 258 - static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, 259 - struct rpcrdma_req *req, 260 - struct rpcrdma_mr_seg *seg, 261 - int nsegs, bool writing, 262 - struct rpcrdma_mr **mr) 295 + static int rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, 296 + struct rpcrdma_req *req, 297 + struct rpcrdma_xdr_cursor *cur, 298 + bool writing, struct rpcrdma_mr **mr) 263 299 { 264 300 *mr = rpcrdma_mr_pop(&req->rl_free_mrs); 265 301 if (!*mr) { ··· 269 307 } 270 308 271 309 rpcrdma_mr_push(*mr, &req->rl_registered); 272 - return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); 310 + return frwr_map(r_xprt, cur, writing, req->rl_slot.rq_xid, *mr); 273 311 274 312 out_getmr_err: 275 313 trace_xprtrdma_nomrs_err(r_xprt, req); 276 314 xprt_wait_for_buffer_space(&r_xprt->rx_xprt); 277 315 
rpcrdma_mrs_refresh(r_xprt); 278 - return ERR_PTR(-EAGAIN); 316 + return -EAGAIN; 279 317 } 280 318 281 319 /* Register and XDR encode the Read list. Supports encoding a list of read ··· 298 336 enum rpcrdma_chunktype rtype) 299 337 { 300 338 struct xdr_stream *xdr = &req->rl_stream; 301 - struct rpcrdma_mr_seg *seg; 339 + struct rpcrdma_xdr_cursor cur; 302 340 struct rpcrdma_mr *mr; 303 341 unsigned int pos; 304 - int nsegs; 342 + int ret; 305 343 306 344 if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) 307 345 goto done; ··· 309 347 pos = rqst->rq_snd_buf.head[0].iov_len; 310 348 if (rtype == rpcrdma_areadch) 311 349 pos = 0; 312 - seg = req->rl_segments; 313 - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, 314 - rtype, seg); 315 - if (nsegs < 0) 316 - return nsegs; 350 + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_snd_buf, pos, rtype); 317 351 318 352 do { 319 - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); 320 - if (IS_ERR(seg)) 321 - return PTR_ERR(seg); 353 + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, false, &mr); 354 + if (ret) 355 + return ret; 322 356 323 357 if (encode_read_segment(xdr, mr, pos) < 0) 324 358 return -EMSGSIZE; 325 359 326 - trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); 360 + trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, 361 + rpcrdma_xdr_cursor_done(&cur)); 327 362 r_xprt->rx_stats.read_chunk_count++; 328 - nsegs -= mr->mr_nents; 329 - } while (nsegs); 363 + } while (!rpcrdma_xdr_cursor_done(&cur)); 330 364 331 365 done: 332 366 if (xdr_stream_encode_item_absent(xdr) < 0) ··· 352 394 { 353 395 struct xdr_stream *xdr = &req->rl_stream; 354 396 struct rpcrdma_ep *ep = r_xprt->rx_ep; 355 - struct rpcrdma_mr_seg *seg; 397 + struct rpcrdma_xdr_cursor cur; 356 398 struct rpcrdma_mr *mr; 357 - int nsegs, nchunks; 399 + int nchunks, ret; 358 400 __be32 *segcount; 359 401 360 402 if (wtype != rpcrdma_writech) 361 403 goto done; 362 404 363 - seg = req->rl_segments; 364 - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 365 - rqst->rq_rcv_buf.head[0].iov_len, 366 - wtype, seg); 367 - if (nsegs < 0) 368 - return nsegs; 405 + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, 406 + rqst->rq_rcv_buf.head[0].iov_len, wtype); 369 407 370 408 if (xdr_stream_encode_item_present(xdr) < 0) 371 409 return -EMSGSIZE; ··· 372 418 373 419 nchunks = 0; 374 420 do { 375 - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); 376 - if (IS_ERR(seg)) 377 - return PTR_ERR(seg); 421 + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr); 422 + if (ret) 423 + return ret; 378 424 379 425 if (encode_rdma_segment(xdr, mr) < 0) 380 426 return -EMSGSIZE; 381 427 382 - trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); 428 + trace_xprtrdma_chunk_write(rqst->rq_task, mr, 429 + rpcrdma_xdr_cursor_done(&cur)); 383 430 r_xprt->rx_stats.write_chunk_count++; 384 431 r_xprt->rx_stats.total_rdma_request += mr->mr_length; 385 432 nchunks++; 386 - nsegs -= mr->mr_nents; 387 - } while (nsegs); 433 + } while (!rpcrdma_xdr_cursor_done(&cur)); 388 434 389 435 if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) { 390 436 if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0) 391 437 return -EMSGSIZE; 392 438 393 439 trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr, 394 - nsegs); 440 + true); 395 441 r_xprt->rx_stats.write_chunk_count++; 396 - r_xprt->rx_stats.total_rdma_request += mr->mr_length; 442 + r_xprt->rx_stats.total_rdma_request += 443 + ep->re_write_pad_mr->mr_length; 397 444 nchunks++; 398 - nsegs -= mr->mr_nents; 399 445 } 
400 446 401 447 /* Update count of segments in this Write chunk */ ··· 425 471 enum rpcrdma_chunktype wtype) 426 472 { 427 473 struct xdr_stream *xdr = &req->rl_stream; 428 - struct rpcrdma_mr_seg *seg; 474 + struct rpcrdma_xdr_cursor cur; 429 475 struct rpcrdma_mr *mr; 430 - int nsegs, nchunks; 476 + int nchunks, ret; 431 477 __be32 *segcount; 432 478 433 479 if (wtype != rpcrdma_replych) { ··· 436 482 return 0; 437 483 } 438 484 439 - seg = req->rl_segments; 440 - nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); 441 - if (nsegs < 0) 442 - return nsegs; 485 + rpcrdma_xdr_cursor_init(&cur, &rqst->rq_rcv_buf, 0, wtype); 443 486 444 487 if (xdr_stream_encode_item_present(xdr) < 0) 445 488 return -EMSGSIZE; ··· 447 496 448 497 nchunks = 0; 449 498 do { 450 - seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); 451 - if (IS_ERR(seg)) 452 - return PTR_ERR(seg); 499 + ret = rpcrdma_mr_prepare(r_xprt, req, &cur, true, &mr); 500 + if (ret) 501 + return ret; 453 502 454 503 if (encode_rdma_segment(xdr, mr) < 0) 455 504 return -EMSGSIZE; 456 505 457 - trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); 506 + trace_xprtrdma_chunk_reply(rqst->rq_task, mr, 507 + rpcrdma_xdr_cursor_done(&cur)); 458 508 r_xprt->rx_stats.reply_chunk_count++; 459 509 r_xprt->rx_stats.total_rdma_request += mr->mr_length; 460 510 nchunks++; 461 - nsegs -= mr->mr_nents; 462 - } while (nsegs); 511 + } while (!rpcrdma_xdr_cursor_done(&cur)); 463 512 464 513 /* Update count of segments in the Reply chunk */ 465 514 *segcount = cpu_to_be32(nchunks); ··· 1422 1471 credits = 1; /* don't deadlock */ 1423 1472 else if (credits > r_xprt->rx_ep->re_max_requests) 1424 1473 credits = r_xprt->rx_ep->re_max_requests; 1425 - rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1)); 1426 1474 if (buf->rb_credits != credits) 1427 1475 rpcrdma_update_cwnd(r_xprt, credits); 1428 1476 ··· 1440 1490 /* LocalInv completion will complete the RPC */ 1441 1491 else 1442 1492 kref_put(&req->rl_kref, rpcrdma_reply_done); 1443 - return; 1444 1493 1445 - out_badversion: 1446 - trace_xprtrdma_reply_vers_err(rep); 1447 - goto out; 1494 + out_post: 1495 + rpcrdma_post_recvs(r_xprt, 1496 + credits + (buf->rb_bc_srv_max_requests << 1)); 1497 + return; 1448 1498 1449 1499 out_norqst: 1450 1500 spin_unlock(&xprt->queue_lock); 1451 1501 trace_xprtrdma_reply_rqst_err(rep); 1502 + rpcrdma_rep_put(buf, rep); 1503 + goto out_post; 1504 + 1505 + out_badversion: 1506 + trace_xprtrdma_reply_vers_err(rep); 1452 1507 goto out; 1453 1508 1454 1509 out_shortreply:
+15 -2
net/sunrpc/xprtrdma/transport.c
··· 510 510 return; 511 511 512 512 out_sleep: 513 - task->tk_status = -ENOMEM; 514 - xprt_add_backlog(xprt, task); 513 + task->tk_status = -EAGAIN; 514 + xprt_add_backlog_noncongested(xprt, task); 515 + /* A buffer freed between buffer_get and rpc_sleep_on 516 + * goes back to the pool with no waiter to wake. 517 + * Re-check after joining the backlog to close that gap. 518 + */ 519 + req = rpcrdma_buffer_get(&r_xprt->rx_buf); 520 + if (req) { 521 + struct rpc_rqst *rqst = &req->rl_slot; 522 + 523 + if (!xprt_wake_up_backlog(xprt, rqst)) { 524 + memset(rqst, 0, sizeof(*rqst)); 525 + rpcrdma_buffer_put(&r_xprt->rx_buf, req); 526 + } 527 + } 515 528 } 516 529 517 530 /**
+17 -2
net/sunrpc/xprtrdma/verbs.c
··· 708 708 */ 709 709 xprt_wait_for_buffer_space(&r_xprt->rx_xprt); 710 710 r_xprt->rx_stats.empty_sendctx_q++; 711 + 712 + /* Recheck: a Send completion between the ring-empty test 713 + * and the set_bit could cause its xprt_write_space() to 714 + * miss, leaving XPRT_WRITE_SPACE set with a non-full ring. 715 + * The smp_mb__after_atomic() pairs with smp_store_release() 716 + * in rpcrdma_sendctx_put_locked(). 717 + */ 718 + smp_mb__after_atomic(); 719 + next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); 720 + if (next_head != READ_ONCE(buf->rb_sc_tail)) 721 + xprt_write_space(&r_xprt->rx_xprt); 722 + 711 723 return NULL; 712 724 } 713 725 ··· 751 739 752 740 } while (buf->rb_sc_ctxs[next_tail] != sc); 753 741 754 - /* Paired with READ_ONCE */ 742 + /* Paired with READ_ONCE in rpcrdma_sendctx_get_locked(): 743 + * both the fast-path ring-full test and the post-set_bit 744 + * recheck in the slow path depend on this store-release. 745 + */ 755 746 smp_store_release(&buf->rb_sc_tail, next_tail); 756 747 757 748 xprt_write_space(&r_xprt->rx_xprt); ··· 1374 1359 if (likely(ep->re_receive_count > needed)) 1375 1360 goto out; 1376 1361 needed -= ep->re_receive_count; 1377 - needed += RPCRDMA_MAX_RECV_BATCH; 1362 + needed += ep->re_recv_batch; 1378 1363 1379 1364 if (atomic_inc_return(&ep->re_receiving) > 1) 1380 1365 goto out_dec;
+30 -13
net/sunrpc/xprtrdma/xprt_rdma.h
··· 96 96 struct rpcrdma_notification re_rn; 97 97 int re_receive_count; 98 98 unsigned int re_max_requests; /* depends on device */ 99 + unsigned int re_recv_batch; 99 100 unsigned int re_inline_send; /* negotiated */ 100 101 unsigned int re_inline_recv; /* negotiated */ 101 102 ··· 284 283 * registered or invalidated. Must handle a Reply chunk: 285 284 */ 286 285 enum { 287 - RPCRDMA_MAX_IOV_SEGS = 3, 286 + RPCRDMA_MAX_IOV_SEGS = 3, /* head, page-boundary, tail */ 288 287 RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, 289 288 RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + 290 289 RPCRDMA_MAX_IOV_SEGS, 291 290 }; 292 291 293 - /* Arguments for DMA mapping and registration */ 294 - struct rpcrdma_mr_seg { 295 - u32 mr_len; /* length of segment */ 296 - struct page *mr_page; /* underlying struct page */ 297 - u64 mr_offset; /* IN: page offset, OUT: iova */ 292 + /** 293 + * struct rpcrdma_xdr_cursor - tracks position within an xdr_buf 294 + * for iterative MR registration 295 + * @xc_buf: the xdr_buf being iterated 296 + * @xc_page_offset: byte offset into the page region consumed so far 297 + * @xc_flags: combination of XC_* bits 298 + * 299 + * Each XC_*_DONE flag indicates that this region has no 300 + * remaining MR registration work. That condition holds both when the region 301 + * has already been registered by a prior frwr_map() call and 302 + * when the region is excluded from this chunk type (pre-set 303 + * at init time by rpcrdma_xdr_cursor_init()). frwr_map() 304 + * treats the two cases identically: skip the region. 305 + */ 306 + struct rpcrdma_xdr_cursor { 307 + const struct xdr_buf *xc_buf; 308 + unsigned int xc_page_offset; 309 + unsigned int xc_flags; 298 310 }; 311 + 312 + #define XC_HEAD_DONE BIT(0) 313 + #define XC_PAGES_DONE BIT(1) 314 + #define XC_TAIL_DONE BIT(2) 299 315 300 316 /* The Send SGE array is provisioned to send a maximum size 301 317 * inline request: ··· 348 330 349 331 struct list_head rl_free_mrs; 350 332 struct list_head rl_registered; 351 - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; 352 333 }; 353 334 354 335 static inline struct rpcrdma_req * ··· 467 450 } 468 451 469 452 /* Setting this to 0 ensures interoperability with early servers. 470 - * Setting this to 1 enhances certain unaligned read/write performance. 471 - * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ 453 + * Setting this to 1 enhances unaligned read/write performance. 454 + * Default is 0, see sysctl entry and rpc_rdma.c */ 472 455 extern int xprt_rdma_pad_optimize; 473 456 474 457 /* This setting controls the hunt for a supported memory ··· 552 535 int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); 553 536 int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); 554 537 void frwr_mr_release(struct rpcrdma_mr *mr); 555 - struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, 556 - struct rpcrdma_mr_seg *seg, 557 - int nsegs, bool writing, __be32 xid, 558 - struct rpcrdma_mr *mr); 538 + int frwr_map(struct rpcrdma_xprt *r_xprt, 539 + struct rpcrdma_xdr_cursor *cur, 540 + bool writing, __be32 xid, 541 + struct rpcrdma_mr *mr); 559 542 int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); 560 543 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); 561 544 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
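
Taken together with the frwr_ops.c and rpc_rdma.c hunks, the cursor replaces the fixed rpcrdma_mr_seg array: callers now invoke frwr_map() repeatedly, and the per-region DONE flags record how far registration has progressed, so a call can stop at any boundary without the caller counting segments. A compact userspace sketch of that iteration shape, with invented names and sizes (not the transport code):

#include <stdio.h>

#define DONE_HEAD  (1u << 0)
#define DONE_PAGES (1u << 1)
#define DONE_TAIL  (1u << 2)
#define DONE_ALL   (DONE_HEAD | DONE_PAGES | DONE_TAIL)

struct cursor {
    unsigned int page_off;    /* bytes of the page region consumed */
    unsigned int page_len;    /* total page region length */
    unsigned int flags;       /* DONE_* bits, like XC_*_DONE above */
};

/* One "map" call: consume the next region (or part of the page region),
 * mirroring a single frwr_map() invocation that fills one MR. */
static void map_some(struct cursor *c, unsigned int max_bytes)
{
    if (!(c->flags & DONE_HEAD)) {
        printf("registered head\n");
        c->flags |= DONE_HEAD;
        return;
    }
    if (!(c->flags & DONE_PAGES)) {
        unsigned int n = c->page_len - c->page_off;

        if (n > max_bytes)
            n = max_bytes;
        c->page_off += n;
        printf("registered %u page bytes\n", n);
        if (c->page_off == c->page_len)
            c->flags |= DONE_PAGES;
        return;
    }
    printf("registered tail\n");
    c->flags |= DONE_TAIL;
}

int main(void)
{
    struct cursor c = { .page_off = 0, .page_len = 10000, .flags = 0 };

    /* Caller loop, shaped like rpcrdma_encode_read_list(): keep mapping
     * until the cursor reports every region done. */
    while ((c.flags & DONE_ALL) != DONE_ALL)
        map_some(&c, 4096);
    return 0;
}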