summaryrefslogtreecommitdiffstats
path: root/net/sunrpc/xprtrdma
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-05-10 13:29:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-05-10 13:29:23 -0700
commitc70422f760c120480fee4de6c38804c72aa26bc1 (patch)
tree9c61102379bbbf090c13c373ffdace76fe7711ef /net/sunrpc/xprtrdma
parent73ccb023a2f25b72c4b95499ca24760588014614 (diff)
parentb26b78cb726007533d81fdf90a62e915002ef5c8 (diff)
Merge tag 'nfsd-4.12' of git://linux-nfs.org/~bfields/linux
Pull nfsd updates from Bruce Fields: "Another RDMA update from Chuck Lever, and a bunch of miscellaneous bugfixes" * tag 'nfsd-4.12' of git://linux-nfs.org/~bfields/linux: (26 commits) nfsd: Fix up the "supattr_exclcreat" attributes nfsd: encoders mustn't use unitialized values in error cases nfsd: fix undefined behavior in nfsd4_layout_verify lockd: fix lockd shutdown race NFSv4: Fix callback server shutdown SUNRPC: Refactor svc_set_num_threads() NFSv4.x/callback: Create the callback service through svc_create_pooled lockd: remove redundant check on block svcrdma: Clean out old XDR encoders svcrdma: Remove the req_map cache svcrdma: Remove unused RDMA Write completion handler svcrdma: Reduce size of sge array in struct svc_rdma_op_ctxt svcrdma: Clean up RPC-over-RDMA backchannel reply processing svcrdma: Report Write/Reply chunk overruns svcrdma: Clean up RDMA_ERROR path svcrdma: Use rdma_rw API in RPC reply path svcrdma: Introduce local rdma_rw API helpers svcrdma: Clean up svc_rdma_get_inv_rkey() svcrdma: Add helper to save pages under I/O svcrdma: Eliminate RPCRDMA_SQ_DEPTH_MULT ...
Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c8
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c71
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c89
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c79
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c512
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c978
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c110
8 files changed, 1100 insertions, 749 deletions
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index ef19fa42c50f..c1ae8142ab73 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
- module.o
+ svc_rdma_rw.o module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index c846ca9f1eba..a4a8f6989ee7 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
-unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
-static unsigned int min_max_inline = 4096;
-static unsigned int max_max_inline = 65536;
+unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
atomic_t rdma_stat_recv;
atomic_t rdma_stat_read;
@@ -247,8 +247,6 @@ int svc_rdma_init(void)
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
- dprintk("\tsq_depth : %u\n",
- svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index ff1df40f0d26..c676ed0efb5a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -12,7 +12,17 @@
#undef SVCRDMA_BACKCHANNEL_DEBUG
-int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
+/**
+ * svc_rdma_handle_bc_reply - Process incoming backchannel reply
+ * @xprt: controlling backchannel transport
+ * @rdma_resp: pointer to incoming transport header
+ * @rcvbuf: XDR buffer into which to decode the reply
+ *
+ * Returns:
+ * %0 if @rcvbuf is filled in, xprt_complete_rqst called,
+ * %-EAGAIN if server should call ->recvfrom again.
+ */
+int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
struct xdr_buf *rcvbuf)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
p = (__be32 *)src->iov_base;
len = src->iov_len;
- xid = rmsgp->rm_xid;
+ xid = *rdma_resp;
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: xid=%08x, length=%zu\n",
__func__, be32_to_cpu(xid), len);
pr_info("%s: RPC/RDMA: %*ph\n",
- __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
+ __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
pr_info("%s: RPC: %*ph\n",
__func__, (int)len, p);
#endif
@@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
goto out_unlock;
memcpy(dst->iov_base, p, len);
- credits = be32_to_cpu(rmsgp->rm_credit);
+ credits = be32_to_cpup(rdma_resp + 2);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
@@ -90,9 +100,9 @@ out_notfound:
* Caller holds the connection's mutex and has already marshaled
* the RPC/RDMA request.
*
- * This is similar to svc_rdma_reply, but takes an rpc_rqst
- * instead, does not support chunks, and avoids blocking memory
- * allocation.
+ * This is similar to svc_rdma_send_reply_msg, but takes a struct
+ * rpc_rqst instead, does not support chunks, and avoids blocking
+ * memory allocation.
*
* XXX: There is still an opportunity to block in svc_rdma_send()
* if there are no SQ entries to post the Send. This may occur if
@@ -101,59 +111,36 @@ out_notfound:
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
struct rpc_rqst *rqst)
{
- struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
struct svc_rdma_op_ctxt *ctxt;
- struct svc_rdma_req_map *vec;
- struct ib_send_wr send_wr;
int ret;
- vec = svc_rdma_get_req_map(rdma);
- ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false);
- if (ret)
+ ctxt = svc_rdma_get_context(rdma);
+
+ /* rpcrdma_bc_send_request builds the transport header and
+ * the backchannel RPC message in the same buffer. Thus only
+ * one SGE is needed to send both.
+ */
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer,
+ rqst->rq_snd_buf.len);
+ if (ret < 0)
goto out_err;
ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
if (ret)
goto out_err;
- ctxt = svc_rdma_get_context(rdma);
- ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
- ctxt->count = 1;
-
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = sndbuf->len;
- ctxt->sge[0].addr =
- ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
- sndbuf->len, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
- ret = -EIO;
- goto out_unmap;
- }
- svc_rdma_count_mappings(rdma, ctxt);
-
- memset(&send_wr, 0, sizeof(send_wr));
- ctxt->cqe.done = svc_rdma_wc_send;
- send_wr.wr_cqe = &ctxt->cqe;
- send_wr.sg_list = ctxt->sge;
- send_wr.num_sge = 1;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
-
- ret = svc_rdma_send(rdma, &send_wr);
- if (ret) {
- ret = -EIO;
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0);
+ if (ret)
goto out_unmap;
- }
out_err:
- svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: %s returns %d\n", __func__, ret);
return ret;
out_unmap:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
+ ret = -EIO;
goto out_err;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 1c4aabf0f657..bdcf7d85a3dc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -166,92 +166,3 @@ out_inval:
dprintk("svcrdma: failed to parse transport header\n");
return -EINVAL;
}
-
-int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rmsgp,
- enum rpcrdma_errcode err, __be32 *va)
-{
- __be32 *startp = va;
-
- *va++ = rmsgp->rm_xid;
- *va++ = rmsgp->rm_vers;
- *va++ = xprt->sc_fc_credits;
- *va++ = rdma_error;
- *va++ = cpu_to_be32(err);
- if (err == ERR_VERS) {
- *va++ = rpcrdma_version;
- *va++ = rpcrdma_version;
- }
-
- return (int)((unsigned long)va - (unsigned long)startp);
-}
-
-/**
- * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
- * @rdma_resp: buffer containing Reply transport header
- *
- * Returns length of transport header, in bytes.
- */
-unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
-{
- unsigned int nsegs;
- __be32 *p;
-
- p = rdma_resp;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p += rpcrdma_fixed_maxsz + 1;
-
- /* Skip Write list. */
- while (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- /* Skip Reply chunk. */
- if (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- return (unsigned long)p - (unsigned long)rdma_resp;
-}
-
-void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
-{
- struct rpcrdma_write_array *ary;
-
- /* no read-list */
- rmsgp->rm_body.rm_chunks[0] = xdr_zero;
-
- /* write-array discrim */
- ary = (struct rpcrdma_write_array *)
- &rmsgp->rm_body.rm_chunks[1];
- ary->wc_discrim = xdr_one;
- ary->wc_nchunks = cpu_to_be32(chunks);
-
- /* write-list terminator */
- ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
-
- /* reply-array discriminator */
- ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
-}
-
-void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
- int chunks)
-{
- ary->wc_discrim = xdr_one;
- ary->wc_nchunks = cpu_to_be32(chunks);
-}
-
-void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
- int chunk_no,
- __be32 rs_handle,
- __be64 rs_offset,
- u32 write_len)
-{
- struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
- seg->rs_handle = rs_handle;
- seg->rs_offset = rs_offset;
- seg->rs_length = cpu_to_be32(write_len);
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f7b2daf72a86..27a99bf5b1a6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.buflen = head->arg.buflen;
}
+static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
+ __be32 *rdma_argp, int status)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+ __be32 *p, *err_msgp;
+ unsigned int length;
+ struct page *page;
+ int ret;
+
+ ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
+ if (ret)
+ return;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return;
+ err_msgp = page_address(page);
+
+ p = err_msgp;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
+ *p++ = xprt->sc_fc_credits;
+ *p++ = rdma_error;
+ if (status == -EPROTONOSUPPORT) {
+ *p++ = err_vers;
+ *p++ = rpcrdma_version;
+ *p++ = rpcrdma_version;
+ } else {
+ *p++ = err_chunk;
+ }
+ length = (unsigned long)p - (unsigned long)err_msgp;
+
+ /* Map transport header; no RPC message payload */
+ ctxt = svc_rdma_get_context(xprt);
+ ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length);
+ if (ret) {
+ dprintk("svcrdma: Error %d mapping send for protocol error\n",
+ ret);
+ return;
+ }
+
+ ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0);
+ if (ret) {
+ dprintk("svcrdma: Error %d posting send for protocol error\n",
+ ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ }
+}
+
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
-static bool
-svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
+static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
+ __be32 *rdma_resp)
{
- __be32 *p = (__be32 *)rmsgp;
+ __be32 *p;
if (!xprt->xpt_bc_xprt)
return false;
- if (rmsgp->rm_type != rdma_msg)
+ p = rdma_resp + 3;
+ if (*p++ != rdma_msg)
return false;
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
+
+ if (*p++ != xdr_zero)
return false;
- if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- /* sanity */
- if (p[7] != rmsgp->rm_xid)
+ /* XID sanity */
+ if (*p++ != *rdma_resp)
return false;
/* call direction */
- if (p[8] == cpu_to_be32(RPC_CALL))
+ if (*p == cpu_to_be32(RPC_CALL))
return false;
return true;
@@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto out_drop;
rqstp->rq_xprt_hlen = ret;
- if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
- ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
+ if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) {
+ ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt,
+ &rmsgp->rm_xid,
&rqstp->rq_arg);
svc_rdma_put_context(ctxt, 0);
if (ret)
@@ -686,7 +739,7 @@ complete:
return ret;
out_err:
- svc_rdma_send_error(rdma_xprt, rmsgp, ret);
+ svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret);
svc_rdma_put_context(ctxt, 0);
return 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
new file mode 100644
index 000000000000..0cf620277693
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
+ */
+
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/debug.h>
+
+#include <rdma/rw.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/* Each R/W context contains state for one chain of RDMA Read or
+ * Write Work Requests.
+ *
+ * Each WR chain handles a single contiguous server-side buffer,
+ * because scatterlist entries after the first have to start on
+ * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ *
+ * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
+ * from a client may contain a unique R_key, so each WR chain moves
+ * up to one segment at a time.
+ *
+ * The scatterlist makes this data structure over 4KB in size. To
+ * make it less likely to fail, and to handle the allocation for
+ * smaller I/O requests without disabling bottom-halves, these
+ * contexts are created on demand, but cached and reused until the
+ * controlling svcxprt_rdma is destroyed.
+ */
+struct svc_rdma_rw_ctxt {
+ struct list_head rw_list;
+ struct rdma_rw_ctx rw_ctx;
+ int rw_nents;
+ struct sg_table rw_sg_table;
+ struct scatterlist rw_first_sgl[0];
+};
+
+static inline struct svc_rdma_rw_ctxt *
+svc_rdma_next_ctxt(struct list_head *list)
+{
+ return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
+ rw_list);
+}
+
+static struct svc_rdma_rw_ctxt *
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+{
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ spin_lock(&rdma->sc_rw_ctxt_lock);
+
+ ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
+ if (ctxt) {
+ list_del(&ctxt->rw_list);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ } else {
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ ctxt = kmalloc(sizeof(*ctxt) +
+ SG_CHUNK_SIZE * sizeof(struct scatterlist),
+ GFP_KERNEL);
+ if (!ctxt)
+ goto out;
+ INIT_LIST_HEAD(&ctxt->rw_list);
+ }
+
+ ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
+ if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
+ ctxt->rw_sg_table.sgl)) {
+ kfree(ctxt);
+ ctxt = NULL;
+ }
+out:
+ return ctxt;
+}
+
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ sg_free_table_chained(&ctxt->rw_sg_table, true);
+
+ spin_lock(&rdma->sc_rw_ctxt_lock);
+ list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+}
+
+/**
+ * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
+ * @rdma: transport about to be destroyed
+ *
+ */
+void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
+{
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
+ list_del(&ctxt->rw_list);
+ kfree(ctxt);
+ }
+}
+
+/* A chunk context tracks all I/O for moving one Read or Write
+ * chunk. This is a a set of rdma_rw's that handle data movement
+ * for all segments of one chunk.
+ *
+ * These are small, acquired with a single allocator call, and
+ * no more than one is needed per chunk. They are allocated on
+ * demand, and not cached.
+ */
+struct svc_rdma_chunk_ctxt {
+ struct ib_cqe cc_cqe;
+ struct svcxprt_rdma *cc_rdma;
+ struct list_head cc_rwctxts;
+ int cc_sqecount;
+ enum dma_data_direction cc_dir;
+};
+
+static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+ struct svc_rdma_chunk_ctxt *cc,
+ enum dma_data_direction dir)
+{
+ cc->cc_rdma = rdma;
+ svc_xprt_get(&rdma->sc_xprt);
+
+ INIT_LIST_HEAD(&cc->cc_rwctxts);
+ cc->cc_sqecount = 0;
+ cc->cc_dir = dir;
+}
+
+static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc)
+{
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
+ list_del(&ctxt->rw_list);
+
+ rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+ ctxt->rw_nents, cc->cc_dir);
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ }
+ svc_xprt_put(&rdma->sc_xprt);
+}
+
+/* State for sending a Write or Reply chunk.
+ * - Tracks progress of writing one chunk over all its segments
+ * - Stores arguments for the SGL constructor functions
+ */
+struct svc_rdma_write_info {
+ /* write state of this chunk */
+ unsigned int wi_seg_off;
+ unsigned int wi_seg_no;
+ unsigned int wi_nsegs;
+ __be32 *wi_segs;
+
+ /* SGL constructor arguments */
+ struct xdr_buf *wi_xdr;
+ unsigned char *wi_base;
+ unsigned int wi_next_off;
+
+ struct svc_rdma_chunk_ctxt wi_cc;
+};
+
+static struct svc_rdma_write_info *
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+{
+ struct svc_rdma_write_info *info;
+
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return info;
+
+ info->wi_seg_off = 0;
+ info->wi_seg_no = 0;
+ info->wi_nsegs = be32_to_cpup(++chunk);
+ info->wi_segs = ++chunk;
+ svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE);
+ return info;
+}
+
+static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
+{
+ svc_rdma_cc_release(&info->wi_cc);
+ kfree(info);
+}
+
+/**
+ * svc_rdma_write_done - Write chunk completion
+ * @cq: controlling Completion Queue
+ * @wc: Work Completion
+ *
+ * Pages under I/O are freed by a subsequent Send completion.
+ */
+static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_chunk_ctxt *cc =
+ container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_write_info *info =
+ container_of(cc, struct svc_rdma_write_info, wi_cc);
+
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wake_up(&rdma->sc_send_wait);
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
+ ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
+ }
+
+ svc_rdma_write_info_free(info);
+}
+
+/* This function sleeps when the transport's Send Queue is congested.
+ *
+ * Assumptions:
+ * - If ib_post_send() succeeds, only one completion is expected,
+ * even if one or more WRs are flushed. This is true when posting
+ * an rdma_rw_ctx or when posting a single signaled WR.
+ */
+static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
+{
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_xprt *xprt = &rdma->sc_xprt;
+ struct ib_send_wr *first_wr, *bad_wr;
+ struct list_head *tmp;
+ struct ib_cqe *cqe;
+ int ret;
+
+ first_wr = NULL;
+ cqe = &cc->cc_cqe;
+ list_for_each(tmp, &cc->cc_rwctxts) {
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
+ first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, cqe, first_wr);
+ cqe = NULL;
+ }
+
+ do {
+ if (atomic_sub_return(cc->cc_sqecount,
+ &rdma->sc_sq_avail) > 0) {
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret)
+ break;
+ return 0;
+ }
+
+ atomic_inc(&rdma_stat_sq_starve);
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wait_event(rdma->sc_send_wait,
+ atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
+ } while (1);
+
+ pr_err("svcrdma: ib_post_send failed (%d)\n", ret);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+
+ /* If even one was posted, there will be a completion. */
+ if (bad_wr != first_wr)
+ return 0;
+
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wake_up(&rdma->sc_send_wait);
+ return -ENOTCONN;
+}
+
+/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+ */
+static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ struct scatterlist *sg = ctxt->rw_sg_table.sgl;
+
+ sg_set_buf(&sg[0], info->wi_base, len);
+ info->wi_base += len;
+
+ ctxt->rw_nents = 1;
+}
+
+/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+ */
+static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
+ unsigned int remaining,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ unsigned int sge_no, sge_bytes, page_off, page_no;
+ struct xdr_buf *xdr = info->wi_xdr;
+ struct scatterlist *sg;
+ struct page **page;
+
+ page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK;
+ page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT;
+ page = xdr->pages + page_no;
+ info->wi_next_off += remaining;
+ sg = ctxt->rw_sg_table.sgl;
+ sge_no = 0;
+ do {
+ sge_bytes = min_t(unsigned int, remaining,
+ PAGE_SIZE - page_off);
+ sg_set_page(sg, *page, sge_bytes, page_off);
+
+ remaining -= sge_bytes;
+ sg = sg_next(sg);
+ page_off = 0;
+ sge_no++;
+ page++;
+ } while (remaining);
+
+ ctxt->rw_nents = sge_no;
+}
+
+/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
+ * an RPC Reply.
+ */
+static int
+svc_rdma_build_writes(struct svc_rdma_write_info *info,
+ void (*constructor)(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt),
+ unsigned int remaining)
+{
+ struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_rw_ctxt *ctxt;
+ __be32 *seg;
+ int ret;
+
+ cc->cc_cqe.done = svc_rdma_write_done;
+ seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
+ do {
+ unsigned int write_len;
+ u32 seg_length, seg_handle;
+ u64 seg_offset;
+
+ if (info->wi_seg_no >= info->wi_nsegs)
+ goto out_overflow;
+
+ seg_handle = be32_to_cpup(seg);
+ seg_length = be32_to_cpup(seg + 1);
+ xdr_decode_hyper(seg + 2, &seg_offset);
+ seg_offset += info->wi_seg_off;
+
+ write_len = min(remaining, seg_length - info->wi_seg_off);
+ ctxt = svc_rdma_get_rw_ctxt(rdma,
+ (write_len >> PAGE_SHIFT) + 2);
+ if (!ctxt)
+ goto out_noctx;
+
+ constructor(info, write_len, ctxt);
+ ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+ ctxt->rw_nents, 0, seg_offset,
+ seg_handle, DMA_TO_DEVICE);
+ if (ret < 0)
+ goto out_initerr;
+
+ list_add(&ctxt->rw_list, &cc->cc_rwctxts);
+ cc->cc_sqecount += ret;
+ if (write_len == seg_length - info->wi_seg_off) {
+ seg += 4;
+ info->wi_seg_no++;
+ info->wi_seg_off = 0;
+ } else {
+ info->wi_seg_off += write_len;
+ }
+ remaining -= write_len;
+ } while (remaining);
+
+ return 0;
+
+out_overflow:
+ dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
+ info->wi_nsegs);
+ return -E2BIG;
+
+out_noctx:
+ dprintk("svcrdma: no R/W ctxs available\n");
+ return -ENOMEM;
+
+out_initerr:
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ pr_err("svcrdma: failed to map pagelist (%d)\n", ret);
+ return -EIO;
+}
+
+/* Send one of an xdr_buf's kvecs by itself. To send a Reply
+ * chunk, the whole RPC Reply is written back to the client.
+ * This function writes either the head or tail of the xdr_buf
+ * containing the Reply.
+ */
+static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
+ struct kvec *vec)
+{
+ info->wi_base = vec->iov_base;
+ return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+ vec->iov_len);
+}
+
+/* Send an xdr_buf's page list by itself. A Write chunk is
+ * just the page list. a Reply chunk is the head, page list,
+ * and tail. This function is shared between the two types
+ * of chunk.
+ */
+static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
+ struct xdr_buf *xdr)
+{
+ info->wi_xdr = xdr;
+ info->wi_next_off = 0;
+ return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+ xdr->page_len);
+}
+
+/**
+ * svc_rdma_send_write_chunk - Write all segments in a Write chunk
+ * @rdma: controlling RDMA transport
+ * @wr_ch: Write chunk provided by client
+ * @xdr: xdr_buf containing the data payload
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ * %-E2BIG if the payload was larger than the Write chunk,
+ * %-ENOMEM if rdma_rw context pool was exhausted,
+ * %-ENOTCONN if posting failed (connection is lost),
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
+ struct xdr_buf *xdr)
+{
+ struct svc_rdma_write_info *info;
+ int ret;
+
+ if (!xdr->page_len)
+ return 0;
+
+ info = svc_rdma_write_info_alloc(rdma, wr_ch);
+ if (!info)
+ return -ENOMEM;
+
+ ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (ret < 0)
+ goto out_err;
+
+ ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ if (ret < 0)
+ goto out_err;
+ return xdr->page_len;
+
+out_err:
+ svc_rdma_write_info_free(info);
+ return ret;
+}
+
+/**
+ * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
+ * @rdma: controlling RDMA transport
+ * @rp_ch: Reply chunk provided by client
+ * @writelist: true if client provided a Write list
+ * @xdr: xdr_buf containing an RPC Reply
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ * %-E2BIG if the payload was larger than the Reply chunk,
+ * %-ENOMEM if rdma_rw context pool was exhausted,
+ * %-ENOTCONN if posting failed (connection is lost),
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
+ bool writelist, struct xdr_buf *xdr)
+{
+ struct svc_rdma_write_info *info;
+ int consumed, ret;
+
+ info = svc_rdma_write_info_alloc(rdma, rp_ch);
+ if (!info)
+ return -ENOMEM;
+
+ ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+ if (ret < 0)
+ goto out_err;
+ consumed = xdr->head[0].iov_len;
+
+ /* Send the page list in the Reply chunk only if the
+ * client did not provide Write chunks.
+ */
+ if (!writelist && xdr->page_len) {
+ ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (ret < 0)
+ goto out_err;
+ consumed += xdr->page_len;
+ }
+
+ if (xdr->tail[0].iov_len) {
+ ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
+ if (ret < 0)
+ goto out_err;
+ consumed += xdr->tail[0].iov_len;
+ }
+
+ ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ if (ret < 0)
+ goto out_err;
+ return consumed;
+
+out_err:
+ svc_rdma_write_info_free(info);
+ return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 515221b16d09..1736337f3a55 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
* Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
@@ -40,6 +41,63 @@
* Author: Tom Tucker <tom@opengridcomputing.com>
*/
+/* Operation
+ *
+ * The main entry point is svc_rdma_sendto. This is called by the
+ * RPC server when an RPC Reply is ready to be transmitted to a client.
+ *
+ * The passed-in svc_rqst contains a struct xdr_buf which holds an
+ * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
+ * transport header, post all Write WRs needed for this Reply, then post
+ * a Send WR conveying the transport header and the RPC message itself to
+ * the client.
+ *
+ * svc_rdma_sendto must fully transmit the Reply before returning, as
+ * the svc_rqst will be recycled as soon as sendto returns. Remaining
+ * resources referred to by the svc_rqst are also recycled at that time.
+ * Therefore any resources that must remain longer must be detached
+ * from the svc_rqst and released later.
+ *
+ * Page Management
+ *
+ * The I/O that performs Reply transmission is asynchronous, and may
+ * complete well after sendto returns. Thus pages under I/O must be
+ * removed from the svc_rqst before sendto returns.
+ *
+ * The logic here depends on Send Queue and completion ordering. Since
+ * the Send WR is always posted last, it will always complete last. Thus
+ * when it completes, it is guaranteed that all previous Write WRs have
+ * also completed.
+ *
+ * Write WRs are constructed and posted. Each Write segment gets its own
+ * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
+ * DMA-unmap the pages under I/O for that Write segment. The Write
+ * completion handler does not release any pages.
+ *
+ * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
+ * The ownership of all of the Reply's pages are transferred into that
+ * ctxt, the Send WR is posted, and sendto returns.
+ *
+ * The svc_rdma_op_ctxt is presented when the Send WR completes. The
+ * Send completion handler finally releases the Reply's pages.
+ *
+ * This mechanism also assumes that completions on the transport's Send
+ * Completion Queue do not run in parallel. Otherwise a Write completion
+ * and Send completion running at the same time could release pages that
+ * are still DMA-mapped.
+ *
+ * Error Handling
+ *
+ * - If the Send WR is posted successfully, it will either complete
+ * successfully, or get flushed. Either way, the Send completion
+ * handler releases the Reply's pages.
+ * - If the Send WR cannot be not posted, the forward path releases
+ * the Reply's pages.
+ *
+ * This handles the case, without the use of page reference counting,
+ * where two different Write segments send portions of the same page.
+ */
+
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/spinlock.h>
@@ -55,113 +113,141 @@ static u32 xdr_padsize(u32 len)
return (len & 3) ? (4 - (len & 3)) : 0;
}
-int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec,
- bool write_chunk_present)
+/* Returns length of transport header, in bytes.
+ */
+static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
{
- int sge_no;
- u32 sge_bytes;
- u32 page_bytes;
- u32 page_off;
- int page_no;
-
- if (xdr->len !=
- (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
- pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
- return -EIO;
- }
+ unsigned int nsegs;
+ __be32 *p;
- /* Skip the first sge, this is for the RPCRDMA header */
- sge_no = 1;
+ p = rdma_resp;
+
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p += rpcrdma_fixed_maxsz + 1;