summary | refs | log | tree | commit | diff | stats
path: root/net/sunrpc
diff options
context:
space:
mode:
author: Trond Myklebust <trond.myklebust@hammerspace.com> 2019-11-18 10:55:55 +0100
committer: Trond Myklebust <trond.myklebust@hammerspace.com> 2019-11-18 10:55:55 +0100
commit: 4e121fcae809a94aa952407bd74b0757b858ce19 (patch)
tree: 6dfb8b2850d47a93f6b8310673e2a7aa8f4c992a /net/sunrpc
parent: f751c5452594f6ef77b39c78f9888275e60d0770 (diff)
parent: a52c23b8b207d676d6cdf531af482a79fa622b9d (diff)
Merge tag 'nfs-rdma-for-5.5-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
NFSoRDMA Client Updates for Linux 5.5

New Features:
- New tracepoints for congestion control and Local Invalidate WRs

Bugfixes and Cleanups:
- Eliminate log noise in call_reserveresult
- Fix unstable connections after a reconnect
- Clean up some code duplication
- Close race between waking a sender and posting a receive
- Fix MR list corruption, and clean up MR usage
- Remove unused rpcrdma_sendctx fields
- Try to avoid DMA mapping pages if it is too costly
- Wake pending tasks if connection fails
- Replace some dprintk()s with tracepoints
Diffstat (limited to 'net/sunrpc')
-rw-r--r-- net/sunrpc/clnt.c 14
-rw-r--r-- net/sunrpc/xprt.c 22
-rw-r--r-- net/sunrpc/xprtrdma/backchannel.c 2
-rw-r--r-- net/sunrpc/xprtrdma/frwr_ops.c 53
-rw-r--r-- net/sunrpc/xprtrdma/rpc_rdma.c 413
-rw-r--r-- net/sunrpc/xprtrdma/transport.c 33
-rw-r--r-- net/sunrpc/xprtrdma/verbs.c 194
-rw-r--r-- net/sunrpc/xprtrdma/xprt_rdma.h 18
8 files changed, 421 insertions, 328 deletions
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 5baf9b9be2e8..a3379765605d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1679,8 +1679,6 @@ call_reserveresult(struct rpc_task *task)
return;
}
- printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n",
- __func__, status);
rpc_call_rpcerror(task, -EIO);
return;
}
@@ -1689,11 +1687,8 @@ call_reserveresult(struct rpc_task *task)
* Even though there was an error, we may have acquired
* a request slot somehow. Make sure not to leak it.
*/
- if (task->tk_rqstp) {
- printk(KERN_ERR "%s: status=%d, request allocated anyway\n",
- __func__, status);
+ if (task->tk_rqstp)
xprt_release(task);
- }
switch (status) {
case -ENOMEM:
@@ -1702,14 +1697,9 @@ call_reserveresult(struct rpc_task *task)
case -EAGAIN: /* woken up; retry */
task->tk_action = call_retry_reserve;
return;
- case -EIO: /* probably a shutdown */
- break;
default:
- printk(KERN_ERR "%s: unrecognized error %d, exiting\n",
- __func__, status);
- break;
+ rpc_call_rpcerror(task, status);
}
- rpc_call_rpcerror(task, status);
}
/*
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 41df4c507193..1aafe8d3f3f4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -205,20 +205,20 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
- return 1;
+ goto out_locked;
goto out_sleep;
}
if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
goto out_unlock;
xprt->snd_task = task;
+out_locked:
+ trace_xprt_reserve_xprt(xprt, task);
return 1;
out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- dprintk("RPC: %5u failed to lock transport %p\n",
- task->tk_pid, xprt);
task->tk_status = -EAGAIN;
if (RPC_IS_SOFT(task))
rpc_sleep_on_timeout(&xprt->sending, task, NULL,
@@ -269,23 +269,22 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
- return 1;
+ goto out_locked;
goto out_sleep;
}
if (req == NULL) {
xprt->snd_task = task;
- return 1;
+ goto out_locked;
}
if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
goto out_unlock;
if (!xprt_need_congestion_window_wait(xprt)) {
xprt->snd_task = task;
- return 1;
+ goto out_locked;
}
out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
task->tk_status = -EAGAIN;
if (RPC_IS_SOFT(task))
rpc_sleep_on_timeout(&xprt->sending, task, NULL,
@@ -293,6 +292,9 @@ out_sleep:
else
rpc_sleep_on(&xprt->sending, task, NULL);
return 0;
+out_locked:
+ trace_xprt_reserve_cong(xprt, task);
+ return 1;
}
EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
@@ -357,6 +359,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
xprt_clear_locked(xprt);
__xprt_lock_write_next(xprt);
}
+ trace_xprt_release_xprt(xprt, task);
}
EXPORT_SYMBOL_GPL(xprt_release_xprt);
@@ -374,6 +377,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
xprt_clear_locked(xprt);
__xprt_lock_write_next_cong(xprt);
}
+ trace_xprt_release_cong(xprt, task);
}
EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
@@ -395,8 +399,7 @@ __xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
if (req->rq_cong)
return 1;
- dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
- req->rq_task->tk_pid, xprt->cong, xprt->cwnd);
+ trace_xprt_get_cong(xprt, req->rq_task);
if (RPCXPRT_CONGESTED(xprt)) {
xprt_set_congestion_window_wait(xprt);
return 0;
@@ -418,6 +421,7 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
req->rq_cong = 0;
xprt->cong -= RPC_CWNDSCALE;
xprt_test_and_clear_congestion_window_wait(xprt);
+ trace_xprt_put_cong(xprt, req->rq_task);
__xprt_lock_write_next_cong(xprt);
}
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index b458bf53ca69..9d02eae353c6 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
*p = xdr_zero;
if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
- &rqst->rq_snd_buf, rpcrdma_noch))
+ &rqst->rq_snd_buf, rpcrdma_noch_pullup))
return -EIO;
trace_xprtrdma_cb_reply(rqst);
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 30065a28628c..523722be6a16 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -36,8 +36,8 @@
* connect worker from running concurrently.
*
* When the underlying transport disconnects, MRs that are in flight
- * are flushed and are likely unusable. Thus all flushed MRs are
- * destroyed. New MRs are created on demand.
+ * are flushed and are likely unusable. Thus all MRs are destroyed.
+ * New MRs are created on demand.
*/
#include <linux/sunrpc/rpc_rdma.h>
@@ -88,8 +88,10 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
kfree(mr);
}
-static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+static void frwr_mr_recycle(struct rpcrdma_mr *mr)
{
+ struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+
trace_xprtrdma_mr_recycle(mr);
if (mr->mr_dir != DMA_NONE) {
@@ -107,32 +109,6 @@ static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
frwr_release_mr(mr);
}
-/* MRs are dynamically allocated, so simply clean up and release the MR.
- * A replacement MR will subsequently be allocated on demand.
- */
-static void
-frwr_mr_recycle_worker(struct work_struct *work)
-{
- struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr,
- mr_recycle);
-
- frwr_mr_recycle(mr->mr_xprt, mr);
-}
-
-/* frwr_recycle - Discard MRs
- * @req: request to reset
- *
- * Used after a reconnect. These MRs could be in flight, we can't
- * tell. Safe thing to do is release them.
- */
-void frwr_recycle(struct rpcrdma_req *req)
-{
- struct rpcrdma_mr *mr;
-
- while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
- frwr_mr_recycle(mr->mr_xprt, mr);
-}
-
/* frwr_reset - Place MRs back on the free list
* @req: request to reset
*
@@ -166,9 +142,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
struct ib_mr *frmr;
int rc;
- /* NB: ib_alloc_mr and device drivers typically allocate
- * memory with GFP_KERNEL.
- */
frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
if (IS_ERR(frmr))
goto out_mr_err;
@@ -180,7 +153,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
mr->frwr.fr_mr = frmr;
mr->mr_dir = DMA_NONE;
INIT_LIST_HEAD(&mr->mr_list);
- INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
init_completion(&mr->frwr.fr_linv_done);
sg_init_table(sg, depth);
@@ -424,7 +396,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
- post_wr = &req->rl_sendctx->sc_wr;
+ post_wr = &req->rl_wr;
list_for_each_entry(mr, &req->rl_registered, mr_list) {
struct rpcrdma_frwr *frwr;
@@ -440,9 +412,6 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
post_wr = &frwr->fr_regwr.wr;
}
- /* If ib_post_send fails, the next ->send_request for
- * @req will queue these MRs for recovery.
- */
return ib_post_send(ia->ri_id->qp, post_wr, NULL);
}
@@ -468,7 +437,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
if (wc->status != IB_WC_SUCCESS)
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
else
rpcrdma_mr_put(mr);
}
@@ -570,7 +539,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
bad_wr = NULL;
rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
- trace_xprtrdma_post_send(req, rc);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -583,6 +551,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
+ trace_xprtrdma_post_linv(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr,
fr_invwr);
@@ -590,7 +559,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
bad_wr = bad_wr->next;
list_del_init(&mr->mr_list);
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
}
}
@@ -673,18 +642,18 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
bad_wr = NULL;
rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
- trace_xprtrdma_post_send(req, rc);
if (!rc)
return;
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
+ trace_xprtrdma_post_linv(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
mr = container_of(frwr, struct rpcrdma_mr, frwr);
bad_wr = bad_wr->next;
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
}
/* The final LOCAL_INV WR in the chain is supposed to
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index b86b5fd62d9f..aec3beb93b25 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -78,8 +78,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
size += rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
- dprintk("RPC: %s: max call header size = %u\n",
- __func__, size);
return size;
}
@@ -100,8 +98,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
- dprintk("RPC: %s: max reply header size = %u\n",
- __func__, size);
return size;
}
@@ -363,8 +359,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
out_getmr_err:
trace_xprtrdma_nomrs(req);
xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
- if (r_xprt->rx_ep.rep_connected != -ENODEV)
- schedule_work(&r_xprt->rx_buf.rb_refresh_worker);
+ rpcrdma_mrs_refresh(r_xprt);
return ERR_PTR(-EAGAIN);
}
@@ -393,7 +388,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
unsigned int pos;
int nsegs;
- if (rtype == rpcrdma_noch)
+ if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
goto done;
pos = rqst->rq_snd_buf.head[0].iov_len;
@@ -565,6 +560,7 @@ static void rpcrdma_sendctx_done(struct kref *kref)
*/
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
+ struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
struct ib_sge *sge;
if (!sc->sc_unmap_count)
@@ -576,7 +572,7 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
*/
for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
++sge, --sc->sc_unmap_count)
- ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length,
+ ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
DMA_TO_DEVICE);
kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
@@ -589,149 +585,228 @@ static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
- struct ib_sge *sge = sc->sc_sges;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
- goto out_regbuf;
+ return false;
sge->addr = rdmab_addr(rb);
sge->length = len;
sge->lkey = rdmab_lkey(rb);
ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
DMA_TO_DEVICE);
- sc->sc_wr.num_sge++;
return true;
-
-out_regbuf:
- pr_err("rpcrdma: failed to DMA map a Send buffer\n");
- return false;
}
-/* Prepare the Send SGEs. The head and tail iovec, and each entry
- * in the page list, gets its own SGE.
+/* The head iovec is straightforward, as it is usually already
+ * DMA-mapped. Sync the content that has changed.
*/
-static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req,
- struct xdr_buf *xdr,
- enum rpcrdma_chunktype rtype)
+static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, unsigned int len)
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
- unsigned int sge_no, page_base, len, remaining;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
struct rpcrdma_regbuf *rb = req->rl_sendbuf;
- struct ib_sge *sge = sc->sc_sges;
- struct page *page, **ppages;
- /* The head iovec is straightforward, as it is already
- * DMA-mapped. Sync the content that has changed.
- */
if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
- goto out_regbuf;
- sc->sc_device = rdmab_device(rb);
- sge_no = 1;
- sge[sge_no].addr = rdmab_addr(rb);
- sge[sge_no].length = xdr->head[0].iov_len;
- sge[sge_no].lkey = rdmab_lkey(rb);
- ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
- sge[sge_no].length, DMA_TO_DEVICE);
-
- /* If there is a Read chunk, the page list is being handled
- * via explicit RDMA, and thus is skipped here. However, the
- * tail iovec may include an XDR pad for the page list, as
- * well as additional content, and may not reside in the
- * same page as the head iovec.
- */
- if (rtype == rpcrdma_readch) {
- len = xdr->tail[0].iov_len;
+ return false;
- /* Do not include the tail if it is only an XDR pad */
- if (len < 4)
- goto out;
+ sge->addr = rdmab_addr(rb);
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
- page = virt_to_page(xdr->tail[0].iov_base);
- page_base = offset_in_page(xdr->tail[0].iov_base);
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
+ DMA_TO_DEVICE);
+ return true;
+}
- /* If the content in the page list is an odd length,
- * xdr_write_pages() has added a pad at the beginning
- * of the tail iovec. Force the tail's non-pad content
- * to land at the next XDR position in the Send message.
- */
- page_base += len & 3;
- len -= len & 3;
- goto map_tail;
- }
+/* If there is a page list present, DMA map and prepare an
+ * SGE for each page to be sent.
+ */
+static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ struct rpcrdma_sendctx *sc = req->rl_sendctx;
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+ unsigned int page_base, len, remaining;
+ struct page **ppages;
+ struct ib_sge *sge;
- /* If there is a page list present, temporarily DMA map
- * and prepare an SGE for each page to be sent.
- */
- if (xdr->page_len) {
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- page_base = offset_in_page(xdr->page_base);
- remaining = xdr->page_len;
- while (remaining) {
- sge_no++;
- if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
- goto out_mapping_overflow;
-
- len = min_t(u32, PAGE_SIZE - page_base, remaining);
- sge[sge_no].addr =
- ib_dma_map_page(rdmab_device(rb), *ppages,
- page_base, len, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdmab_device(rb),
- sge[sge_no].addr))
- goto out_mapping_err;
- sge[sge_no].length = len;
- sge[sge_no].lkey = rdmab_lkey(rb);
-
- sc->sc_unmap_count++;
- ppages++;
- remaining -= len;
- page_base = 0;
- }
- }
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ sge = &sc->sc_sges[req->rl_wr.num_sge++];
+ len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+ sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
+ page_base, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+ goto out_mapping_err;
- /* The tail iovec is not always constructed in the same
- * page where the head iovec resides (see, for example,
- * gss_wrap_req_priv). To neatly accommodate that case,
- * DMA map it separately.
- */
- if (xdr->tail[0].iov_len) {
- page = virt_to_page(xdr->tail[0].iov_base);
- page_base = offset_in_page(xdr->tail[0].iov_base);
- len = xdr->tail[0].iov_len;
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
-map_tail:
- sge_no++;
- sge[sge_no].addr =
- ib_dma_map_page(rdmab_device(rb), page, page_base, len,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr))
- goto out_mapping_err;
- sge[sge_no].length = len;
- sge[sge_no].lkey = rdmab_lkey(rb);
sc->sc_unmap_count++;
+ ppages++;
+ remaining -= len;
+ page_base = 0;
}
-out:
- sc->sc_wr.num_sge += sge_no;
- if (sc->sc_unmap_count)
- kref_get(&req->rl_kref);
return true;
-out_regbuf:
- pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+out_mapping_err:
+ trace_xprtrdma_dma_maperr(sge->addr);
return false;
+}
-out_mapping_overflow:
- rpcrdma_sendctx_unmap(sc);
- pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
- return false;
+/* The tail iovec may include an XDR pad for the page list,
+ * as well as additional content, and may not reside in the
+ * same page as the head iovec.
+ */
+static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
+ struct xdr_buf *xdr,
+ unsigned int page_base, unsigned int len)
+{
+ struct rpcrdma_sendctx *sc = req->rl_sendctx;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+ struct page *page = virt_to_page(xdr->tail[0].iov_base);
+
+ sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+ goto out_mapping_err;
+
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
+ ++sc->sc_unmap_count;
+ return true;
out_mapping_err:
- rpcrdma_sendctx_unmap(sc);
- trace_xprtrdma_dma_maperr(sge[sge_no].addr);
+ trace_xprtrdma_dma_maperr(sge->addr);
return false;
}
+/* Copy the tail to the end of the head buffer.
+ */
+static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ unsigned char *dst;
+
+ dst = (unsigned char *)xdr->head[0].iov_base;
+ dst += xdr->head[0].iov_len + xdr->page_len;
+ memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+ r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
+}
+
+/* Copy pagelist content into the head buffer.
+ */
+static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ unsigned int len, page_base, remaining;
+ struct page **ppages;
+ unsigned char *src, *dst;
+
+ dst = (unsigned char *)xdr->head[0].iov_base;
+ dst += xdr->head[0].iov_len;
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ src = page_address(*ppages);
+ src += page_base;
+ len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+ memcpy(dst, src, len);
+ r_xprt->rx_stats.pullup_copy_count += len;
+
+ ppages++;
+ dst += len;
+ remaining -= len;
+ page_base = 0;
+ }
+}
+
+/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
+ * When the head, pagelist, and tail are small, a pull-up copy
+ * is considerably less costly than DMA mapping the components
+ * of @xdr.
+ *
+ * Assumptions:
+ * - the caller has already verified that the total length
+ * of the RPC Call body will fit into @rl_sendbuf.
+ */
+static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ if (unlikely(xdr->tail[0].iov_len))
+ rpcrdma_pullup_tail_iov(r_xprt, req, xdr);
+
+ if (unlikely(xdr->page_len))
+ rpcrdma_pullup_pagelist(r_xprt, req, xdr);
+
+ /* The whole RPC message resides in the head iovec now */
+ return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
+}
+
+static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ struct kvec *tail = &xdr->tail[0];
+
+ if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+ return false;
+ if (xdr->page_len)
+ if (!rpcrdma_prepare_pagelist(req, xdr))
+ return false;
+ if (tail->iov_len)
+ if (!rpcrdma_prepare_tail_iov(req, xdr,
+ offset_in_page(tail->iov_base),
+ tail->iov_len))
+ return false;
+
+ if (req->rl_sendctx->sc_unmap_count)
+ kref_get(&req->rl_kref);
+ return true;
+}
+
+static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+ return false;
+
+ /* If there is a Read chunk, the page list is being handled
+ * via explicit RDMA, and thus is skipped here.
+ */
+
+ /* Do not include the tail if it is only an XDR pad */
+ if (xdr->tail[0].iov_len > 3) {
+ unsigned int page_base, len;
+
+ /* If the content in the page list is an odd length,
+ * xdr_write_pages() adds a pad at the beginning of
+ * the tail iovec. Force the tail's non-pad content to
+ * land at the next XDR position in the Send message.
+ */
+ page_base = offset_in_page(xdr->tail[0].iov_base);
+ len = xdr->tail[0].iov_len;
+ page_base += len & 3;
+ len -= len & 3;
+ if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
+ return false;
+ kref_get(&req->rl_kref);
+ }
+
+ return true;
+}
+
/**
* rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
* @r_xprt: controlling transport
@@ -742,31 +817,53 @@ out_mapping_err:
*
* Returns 0 on success; otherwise a negative errno is returned.
*/
-int
-rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req, u32 hdrlen,
- struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
+inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, u32 hdrlen,
+ struct xdr_buf *xdr,
+ enum rpcrdma_chunktype rtype)
{
int ret;
ret = -EAGAIN;
req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
if (!req->rl_sendctx)
- goto err;
- req->rl_sendctx->sc_wr.num_sge = 0;
+ goto out_nosc;
req->rl_sendctx->sc_unmap_count = 0;
req->rl_sendctx->sc_req = req;
kref_init(&req->rl_kref);
+ req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe;
+ req->rl_wr.sg_list = req->rl_sendctx->sc_sges;
+ req->rl_wr.num_sge = 0;
+ req->rl_wr.opcode = IB_WR_SEND;
ret = -EIO;
if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
- goto err;
- if (rtype != rpcrdma_areadch)
- if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
- goto err;
+ goto out_unmap;
+
+ switch (rtype) {
+ case rpcrdma_noch_pullup:
+ if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_noch_mapped:
+ if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_readch:
+ if (!rpcrdma_prepare_readch(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_areadch:
+ break;
+ default:
+ goto out_unmap;
+ }
+
return 0;
-err:
+out_unmap:
+ rpcrdma_sendctx_unmap(req->rl_sendctx);
+out_nosc:
trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
return ret;
}
@@ -796,6 +893,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct xdr_stream *xdr = &req->rl_stream;
enum rpcrdma_chunktype rtype, wtype;
+ struct xdr_buf *buf = &rqst->rq_snd_buf;
bool ddp_allowed;
__be32 *p;
int ret;
@@ -853,8 +951,9 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
*/
if (rpcrdma_args_inline(r_xprt, rqst)) {
*p++ = rdma_msg;
- rtype = rpcrdma_noch;
- } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+ rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
+ rpcrdma_noch_pullup : rpcrdma_noch_mapped;
+ } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
*p++ = rdma_msg;
rtype = rpcrdma_readch;
} else {
@@ -863,12 +962,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
rtype = rpcrdma_areadch;
}
- /* If this is a retransmit, discard previously registered
- * chunks. Very likely the connection has been replaced,
- * so these registrations are invalid and unusable.
- */
- frwr_recycle(req);
-
/* This implementation supports the following combinations
* of chunk lists in one RPC-over-RDMA Call message:
*
@@ -902,7 +995,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
goto out_err;
ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
- &rqst->rq_snd_buf, rtype);
+ buf, rtype);
if (ret)
goto out_err;
@@ -916,6 +1009,40 @@ out_err:
return ret;
}
+static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
+ struct rpcrdma_buffer *buf,
+ u32 grant)
+{
+ buf->rb_credits = grant;
+ xprt->cwnd = grant << RPC_CWNDSHIFT;
+}
+
+static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock(&xprt->transport_lock);
+ __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant);
+ spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * rpcrdma_reset_cwnd - Reset the xprt's congestion window
+ * @r_xprt: controlling transport instance
+ *
+ * Prepare @r_xprt for the next connection by reinitializing
+ * its credit grant to one (see RFC 8166, Section 3.3.3).
+ */
+void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock(&xprt->transport_lock);
+ xprt->cong = 0;
+ __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1);
+ spin_unlock(&xprt->transport_lock);
+}
+
/**
* rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
* @rqst: controlling RPC request
@@ -955,7 +1082,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
curlen = rqst->rq_rcv_buf.head[0].iov_len;
if (curlen > copy_len)
curlen = copy_len;
- trace_xprtrdma_fixup(rqst, copy_len, curlen);
srcp += curlen;
copy_len -= curlen;
@@ -975,8 +1101,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
if (curlen > pagelist_len)
curlen = pagelist_len;
- trace_xprtrdma_fixup_pg(rqst, i, srcp,
- copy_len, curlen);
destp = kmap_atomic(ppages[i]);
memcpy(destp + page_base, srcp, curlen);
flush_dcache_page(ppages[i]);
@@ -1008,6 +1132,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
rqst->rq_private_buf.tail[0].iov_base = srcp;
}
+ if (fixup_copy_count)
+ trace_xprtrdma_fixup(rqst, fixup_copy_count);
return fixup_copy_count;
}
@@ -1356,12 +1482,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
credits = 1; /* don't deadlock */
else if (credits > buf->rb_max_requests)
credits = buf->rb_max_requests;
- if (buf->rb_credits != credits) {
- spin_lock(&xprt->transport_lock);
- buf->rb_credits = credits;
- xprt->cwnd = credits << RPC_CWNDSHIFT;
- spin_unlock(&xprt->transport_lock);
- }
+ if (buf->rb_credits != credits)
+ rpcrdma_update_cwnd(r_xprt, credits);
+ rpcrdma_post_recvs(r_xprt, false);
req = rpcr_to_rdmar(rqst);
if (req->rl_reply) {
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 160558b4135e..7395eb2cfdeb 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -243,16 +243,13 @@ xprt_rdma_connect_worker(struct work_struct *work)
rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
xprt_clear_connecting(xprt);
if (r_xprt->rx_ep.rep_connected > 0) {
- if (!xprt_test_and_set_connected(xprt)) {
- xprt->stat.connect_count++;
- xprt->stat.connect_time += (long)jiffies -
- xprt->stat.connect_start;
- xprt_wake_pending_tasks(xprt, -EAGAIN);
- }
- } else {
- if (xprt_test_and_clear_connected(xprt))
- xprt_wake_pending_tasks(xprt, rc);
+ xprt->stat.connect_count++;
+ xprt->stat.connect_time += (long)jiffies -
+ xprt->stat.connect_start;
+ xprt_set_connected(xprt);
+ rc = -EAGAIN;
}
+ xprt_wake_pending_tasks(xprt, rc);
}
/**
@@ -425,12 +422,6 @@ void xprt_rdma_close(struct rpc_xprt *xprt)
return;
rpcrdma_ep_disconnect(ep, ia);
- /* Prepare @xprt for the next connection by reinitializing
- * its credit grant to one (see RFC 8166, Section 3.3.3).
- */
- r_xprt->rx_buf.rb_credits = 1;
- xprt->cwnd = RPC_CWNDSHIFT;
-
out:
xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
@@ -450,12 +441,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
struct sockaddr *sap = (struct sockaddr *)&xprt->addr;
char buf[8];
- dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n",
- __func__, xprt,
- xprt->address_strings[RPC_DISPLAY_ADDR],
- xprt->address_strings[RPC_DISPLAY_PORT],
- port);
-
rpc_set_port(sap, port);
kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
@@ -465,6 +450,9 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
snprintf(buf, sizeof(buf), "%4hx", port);
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
+
+ trace_xprtrdma_op_setport(container_of(xprt, struct rpcrdma_xprt,
+ rx_xprt));
}
/**
@@ -536,13 +524,12 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
unsigned long delay;
- trace_xprtrdma_op_connect(r_xprt);
-
delay = 0;
if (r_xprt->rx_ep.rep_connected != 0) {
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
+ trace_xprtrdma_op_connect(r_xprt, delay);
queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
delay);
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3a907537e2cf..77c7dd7f05e8 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -74,17 +74,17 @@
/*
* internal functions
*/
-static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_sendctx *sc);
+static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
-static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
-static void rpcrdma_mr_free(struct rpcrdma_mr *mr);
+static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
gfp_t flags);
static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb);
static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
-static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
/* Wait for outstanding transport work to finish. ib_drain_qp
* handles the drains in the wrong order for us, so open code
@@ -125,7 +125,7 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context)
/**
* rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
- * @cq: completion queue (ignored)
+ * @cq: completion queue
* @wc