From 2813b626e395f9de1ca0549c1c7d2217c1ce80ea Mon Sep 17 00:00:00 2001
From: Amitoj Kaur Chawla
Date: Mon, 8 Aug 2016 14:43:49 +0530
Subject: sunrpc: Remove unnecessary variable

The variable `err` is never assigned after its initialisation, so the
function simply returns the predefined value `0` at the end. Remove
the variable and return 0 explicitly.

Signed-off-by: Amitoj Kaur Chawla
Signed-off-by: Anna Schumaker
---
 net/sunrpc/clnt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/sunrpc')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 66f23b376fa0..75078bbf261a 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -184,7 +184,6 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
 			struct super_block *sb)
 {
 	struct dentry *dentry;
-	int err = 0;
 
 	switch (event) {
 	case RPC_PIPEFS_MOUNT:
@@ -201,7 +200,7 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
 		printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
 		return -ENOTSUPP;
 	}
-	return err;
+	return 0;
 }
 
 static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
--
cgit v1.2.3

From d00252688604edfd07d0e11a05d3a2b7cf05bb3d Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Mon, 29 Aug 2016 20:03:49 -0400
Subject: SUNRPC: Initialise struct svc_serv backchannel fields during __svc_create()

Clean up.

Signed-off-by: Trond Myklebust
Signed-off-by: Anna Schumaker
---
 net/sunrpc/svc.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net/sunrpc')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c5b0cb4f4056..7c8070ec93c8 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -401,6 +401,21 @@ int svc_bind(struct svc_serv *serv, struct net *net)
 }
 EXPORT_SYMBOL_GPL(svc_bind);
 
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+	INIT_LIST_HEAD(&serv->sv_cb_list);
+	spin_lock_init(&serv->sv_cb_lock);
+	init_waitqueue_head(&serv->sv_cb_waitq);
+}
+#else
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+}
+#endif
+
 /*
  * Create an RPC service
  */
@@ -443,6 +458,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 	init_timer(&serv->sv_temptimer);
 	spin_lock_init(&serv->sv_lock);
 
+	__svc_init_bc(serv);
+
 	serv->sv_nrpools = npools;
 	serv->sv_pools =
 		kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
--
cgit v1.2.3

From 7705f6abbb75ff8058e785a3f2e03b8c3962cfa3 Mon Sep 17 00:00:00 2001
From: Andy Adamson
Date: Fri, 9 Sep 2016 09:22:22 -0400
Subject: SUNRPC remove rpc_task_release_client from rpc_task_set_client

rpc_task_set_client is only called from rpc_run_task, after
rpc_new_task; the task is new, so rpc_task_release_client is not
needed. Worse, on such a task the call to rpc_task_release_client
also dropped the rpc_xprt that rpc_new_task had just assigned, which
is not desired.
Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 75078bbf261a..4bb526f57a09 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -987,7 +987,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) { if (clnt != NULL) { - rpc_task_release_client(task); if (task->tk_xprt == NULL) task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi); task->tk_client = clnt; -- cgit v1.2.3 From 3b58a8a9049d5e191402665c339690a148504358 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:23 -0400 Subject: SUNRPC rpc_clnt_xprt_switch_put Give the NFS layer access to the xprt_switch_put function Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4bb526f57a09..0ff5cbf3d631 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2695,6 +2695,12 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) } EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); +void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) +{ + xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { -- cgit v1.2.3 From dd69171769cf4649a7ff3755e91cbd242a833727 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:24 -0400 Subject: SUNRPC rpc_clnt_xprt_switch_add_xprt Give the NFS layer access to the rpc_xprt_switch_add_xprt function Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0ff5cbf3d631..43ec46547dc9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2701,6 +2701,13 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); +void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), + xprt); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { -- cgit v1.2.3 From 39e5d2df959dd4aea81fa33d765d2a5cc67a0512 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:25 -0400 Subject: SUNRPC search xprt switch for sockaddr Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 15 +++++++++++++++ net/sunrpc/xprtmultipath.c | 24 +++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 43ec46547dc9..8d68efd2026f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2708,6 +2708,21 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); +bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, + const struct sockaddr *sap) +{ + struct rpc_xprt_switch *xps; + bool ret; + + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + + rcu_read_lock(); + ret = rpc_xprt_switch_has_addr(xps, sap); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { diff 
--git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 66c9d63f4797..ae92a9e9ba52 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -15,6 +15,7 @@ #include #include #include +#include #include typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head, @@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xprt == NULL) return; spin_lock(&xps->xps_lock); - if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) + if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) && + !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); } @@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) return xprt_switch_find_current_entry(head, xpi->xpi_cursor); } +bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, + const struct sockaddr *sap) +{ + struct list_head *head; + struct rpc_xprt *pos; + + if (xps == NULL || sap == NULL) + return false; + + head = &xps->xps_xprt_list; + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) { + pr_info("RPC: addr %s already in xprt switch\n", + pos->address_strings[RPC_DISPLAY_ADDR]); + return true; + } + } + return false; +} + static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur) -- cgit v1.2.3 From fda0ab41170ee0a1c7a3781ff8cfb4395c3dd784 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 9 Sep 2016 09:22:26 -0400 Subject: SUNRPC: rpc_clnt_add_xprt setup function for NFS layer Use a setup function to call into the NFS layer to test an rpc_xprt for session trunking so as to not leak the rpc_xprt_switch into the nfs layer. Search for the address in the rpc_xprt_switch first so as not to put an unnecessary EXCHANGE_ID on the wire. Signed-off-by: Andy Adamson Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8d68efd2026f..b614cb1356b9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2613,6 +2613,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, } EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); +/** + * rpc_clnt_setup_test_and_add_xprt() + * + * This is an rpc_clnt_add_xprt setup() function which returns 1 so: + * 1) caller of the test function must dereference the rpc_xprt_switch + * and the rpc_xprt. + * 2) test function must call rpc_xprt_switch_add_xprt, usually in + * the rpc_call_done routine. 
+ *
+ * Upon success (return of 1), the test function adds the new
+ * transport to the rpc_clnt xprt switch.
+ *
+ * @clnt: struct rpc_clnt to get the new transport
+ * @xps: the rpc_xprt_switch to hold the new transport
+ * @xprt: the rpc_xprt to test
+ * @data: a struct rpc_add_xprt_test pointer that holds the test function
+ *        and test function call data
+ */
+int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
+				     struct rpc_xprt_switch *xps,
+				     struct rpc_xprt *xprt,
+				     void *data)
+{
+	struct rpc_cred *cred;
+	struct rpc_task *task;
+	struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
+	int status = -EADDRINUSE;
+
+	xprt = xprt_get(xprt);
+	xprt_switch_get(xps);
+
+	if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+		goto out_err;
+
+	/* Test the connection */
+	cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+	task = rpc_call_null_helper(clnt, xprt, cred,
+				    RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+				    NULL, NULL);
+	put_rpccred(cred);
+	if (IS_ERR(task)) {
+		status = PTR_ERR(task);
+		goto out_err;
+	}
+	status = task->tk_status;
+	rpc_put_task(task);
+
+	if (status < 0)
+		goto out_err;
+
+	/* rpc_xprt_switch and rpc_xprt are dereferenced by add_xprt_test() */
+	xtest->add_xprt_test(clnt, xprt, xtest->data);
+
+	/* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
+	return 1;
+out_err:
+	xprt_put(xprt);
+	xprt_switch_put(xps);
+	pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n",
+		status, xprt->address_strings[RPC_DISPLAY_ADDR]);
+	return status;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt);
+
 /**
  * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt
  * @clnt: pointer to struct rpc_clnt
--
cgit v1.2.3

From eb342e9a38a5ad79866fec2df2d3ca4592bc501b Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Thu, 15 Sep 2016 10:55:04 -0400
Subject: xprtrdma: Eliminate INLINE_THRESHOLD macros

Clean up: r_xprt is already available everywhere these macros are
invoked, so just dereference that directly.

RPCRDMA_INLINE_PAD_VALUE is no longer used, so it can simply be
removed.
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 4 ++-- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- net/sunrpc/xprtrdma/transport.c | 6 +++--- net/sunrpc/xprtrdma/xprt_rdma.h | 9 --------- 4 files changed, 6 insertions(+), 15 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 87762d976b63..5f60ab2f858a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -46,13 +46,13 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, return PTR_ERR(req); req->rl_backchannel = true; - size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); + size = r_xprt->rx_data.inline_wsize; rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; req->rl_rdmabuf = rb; - size += RPCRDMA_INLINE_READ_THRESHOLD(rqst); + size += r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a47f170b20ef..845586f7df47 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -673,7 +673,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) goto out_unmap; hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; - if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + if (hdrlen + rpclen > r_xprt->rx_data.inline_wsize) goto out_overflow; dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 81f0e879f019..be95eced0726 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -518,7 +518,7 @@ out: return req->rl_sendbuf->rg_base; out_rdmabuf: - min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); + min_size = r_xprt->rx_data.inline_wsize; rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags); if (IS_ERR(rb)) goto out_fail; @@ -541,8 +541,8 @@ out_sendbuf: * reply will be large, but slush is provided here to allow * flexibility when marshaling. */ - min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp); - min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); + min_size = r_xprt->rx_data.inline_rsize; + min_size += r_xprt->rx_data.inline_wsize; if (size < min_size) size = min_size; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index a71b0f5897d8..9df47c857d27 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -356,15 +356,6 @@ struct rpcrdma_create_data_internal { unsigned int padding; /* non-rdma write header padding */ }; -#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \ - (rpcx_to_rdmad(rq->rq_xprt).inline_rsize) - -#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\ - (rpcx_to_rdmad(rq->rq_xprt).inline_wsize) - -#define RPCRDMA_INLINE_PAD_VALUE(rq)\ - rpcx_to_rdmad(rq->rq_xprt).padding - /* * Statistics for RPCRDMA */ -- cgit v1.2.3 From b9c5bc03be6aae41990efd09f83cf70a89ac9f4b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:12 -0400 Subject: SUNRPC: Refactor rpc_xdr_buf_init() Clean up: there is some XDR initialization logic that is common to the forward channel and backchannel. Move it to an XDR header so it can be shared. rpc_rqst::rq_buffer points to a buffer containing big-endian data. Update its annotation as part of the clean up. 
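The shared helper simply captures the open-coded initialisation being
removed below; a sketch of it, assuming the same field assignments as
the rpc_xdr_buf_init that clnt.c loses (the helper itself lands in an
XDR header outside this net/sunrpc-limited excerpt):

    static inline void
    xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
    {
            buf->head[0].iov_base = start;
            buf->head[0].iov_len = len;
            buf->tail[0].iov_len = 0;
            buf->page_len = 0;
            buf->flags = 0;
            buf->len = 0;
            buf->buflen = len;
    }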
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/backchannel_rqst.c | 8 +------- net/sunrpc/clnt.c | 24 ++++++------------------ net/sunrpc/xprtrdma/backchannel.c | 12 +----------- 3 files changed, 8 insertions(+), 36 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 229956bf8457..ac701c28f44f 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) page = alloc_page(gfp_flags); if (page == NULL) return -ENOMEM; - buf->head[0].iov_base = page_address(page); - buf->head[0].iov_len = PAGE_SIZE; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = PAGE_SIZE; + xdr_buf_init(buf, page_address(page), PAGE_SIZE); return 0; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b614cb1356b9..6481986be7a7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1746,18 +1746,6 @@ rpc_task_force_reencode(struct rpc_task *task) task->tk_rqstp->rq_bytes_sent = 0; } -static inline void -rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) -{ - buf->head[0].iov_base = start; - buf->head[0].iov_len = len; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->flags = 0; - buf->len = 0; - buf->buflen = len; -} - /* * 3. Encode arguments of an RPC call */ @@ -1770,12 +1758,12 @@ rpc_xdr_encode(struct rpc_task *task) dprint_status(task); - rpc_xdr_buf_init(&req->rq_snd_buf, - req->rq_buffer, - req->rq_callsize); - rpc_xdr_buf_init(&req->rq_rcv_buf, - (char *)req->rq_buffer + req->rq_callsize, - req->rq_rcvsize); + xdr_buf_init(&req->rq_snd_buf, + req->rq_buffer, + req->rq_callsize); + xdr_buf_init(&req->rq_rcv_buf, + (char *)req->rq_buffer + req->rq_callsize, + req->rq_rcvsize); p = rpc_encode_header(task); if (p == NULL) { diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 5f60ab2f858a..d3cfaf281e55 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -38,7 +38,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; - struct xdr_buf *buf; size_t size; req = rpcrdma_create_req(r_xprt); @@ -60,16 +59,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, req->rl_sendbuf = rb; /* so that rpcr_to_rdmar works when receiving a request */ rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; - - buf = &rqst->rq_snd_buf; - buf->head[0].iov_base = rqst->rq_buffer; - buf->head[0].iov_len = 0; - buf->tail[0].iov_base = NULL; - buf->tail[0].iov_len = 0; - buf->page_len = 0; - buf->len = 0; - buf->buflen = size; - + xdr_buf_init(&rqst->rq_snd_buf, rqst->rq_buffer, size); return 0; out_fail: -- cgit v1.2.3 From 5fe6eaa1f9a00b9a5927e3b791ecad2f3eaab130 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:20 -0400 Subject: SUNRPC: Generalize the RPC buffer allocation API xprtrdma needs to allocate the Call and Reply buffers separately. TBH, the reliance on using a single buffer for the pair of XDR buffers is transport implementation-specific. Transports that want to allocate separate Call and Reply buffers will ignore the "size" argument anyway. Don't bother passing it. The buf_alloc method can't return two pointers. Instead, make the method's return value an error code, and set the rq_buffer pointer in the method itself. 
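The shape of the new contract, as a stand-alone sketch with stand-in
types (the kernel's rpc_task and rpc_rqst are larger, and the real
allocators in the hunks below use mempools rather than malloc):

    #include <errno.h>
    #include <stdlib.h>

    /* Stand-in types for illustration only */
    struct rpc_rqst { void *rq_buffer; size_t rq_callsize, rq_rcvsize; };
    struct rpc_task { struct rpc_rqst *tk_rqstp; };

    /* Old: void *(*buf_alloc)(task, size), where NULL meant "retry".
     * New: int (*buf_alloc)(task); 0 means rq_buffer is set, -ENOMEM
     * means retry later, and any other errno terminates the RPC.
     */
    static int example_buf_alloc(struct rpc_task *task)
    {
            struct rpc_rqst *rqst = task->tk_rqstp;
            void *buf = malloc(rqst->rq_callsize + rqst->rq_rcvsize);

            if (!buf)
                    return -ENOMEM;
            rqst->rq_buffer = buf;
            return 0;
    }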
This gives call_allocate an opportunity to terminate an RPC instead of looping forever when a permanent problem occurs. If a request is just bogus, or the transport is in a state where it can't allocate resources for any request, there needs to be a way to kill the RPC right there and not loop. This immediately fixes a rare problem in the backchannel send path, which loops if the server happens to send a CB request whose call+reply size is larger than a page (which it shouldn't do yet). One more issue: looks like xprt_inject_disconnect was incorrectly placed in the failure path in call_allocate. It needs to be in the success path, as it is for other call-sites. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 12 ++++++++---- net/sunrpc/sched.c | 24 +++++++++++++++--------- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 17 +++++++++-------- net/sunrpc/xprtrdma/transport.c | 26 ++++++++++++++++++-------- net/sunrpc/xprtsock.c | 17 +++++++++++------ 5 files changed, 61 insertions(+), 35 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6481986be7a7..5499fda0c1f3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1691,6 +1691,7 @@ call_allocate(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + int status; dprint_status(task); @@ -1716,11 +1717,14 @@ call_allocate(struct rpc_task *task) req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen; req->rq_rcvsize <<= 2; - req->rq_buffer = xprt->ops->buf_alloc(task, - req->rq_callsize + req->rq_rcvsize); - if (req->rq_buffer != NULL) - return; + status = xprt->ops->buf_alloc(task); xprt_inject_disconnect(xprt); + if (status == 0) + return; + if (status != -ENOMEM) { + rpc_exit(task, status); + return; + } dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9ae588511aaf..b964d40b259b 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work) } /** - * rpc_malloc - allocate an RPC buffer - * @task: RPC task that will use this buffer - * @size: requested byte size + * rpc_malloc - allocate RPC buffer resources + * @task: RPC task + * + * A single memory region is allocated, which is split between the + * RPC call and RPC reply that this task is being used for. When + * this RPC is retired, the memory is released by calling rpc_free. * * To prevent rpciod from hanging, this allocator never sleeps, - * returning NULL and suppressing warning if the request cannot be serviced - * immediately. - * The caller can arrange to sleep in a way that is safe for rpciod. + * returning -ENOMEM and suppressing warning if the request cannot + * be serviced immediately. The caller can arrange to sleep in a + * way that is safe for rpciod. * * Most requests are 'small' (under 2KiB) and can be serviced from a * mempool, ensuring that NFS reads and writes can always proceed, @@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work) * In order to avoid memory starvation triggering more writebacks of * NFS requests, we avoid using GFP_KERNEL. 
*/ -void *rpc_malloc(struct rpc_task *task, size_t size) +int rpc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpc_buffer *buf; gfp_t gfp = GFP_NOIO | __GFP_NOWARN; @@ -880,12 +885,13 @@ void *rpc_malloc(struct rpc_task *task, size_t size) buf = kmalloc(size, gfp); if (!buf) - return NULL; + return -ENOMEM; buf->len = size; dprintk("RPC: %5u allocated buffer of size %zu at %p\n", task->tk_pid, size, buf); - return &buf->data; + rqst->rq_buffer = buf->data; + return 0; } EXPORT_SYMBOL_GPL(rpc_malloc); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a2a7519b0f23..124688ba67e5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -159,29 +159,30 @@ out_unmap: /* Server-side transport endpoint wants a whole page for its send * buffer. The client RPC code constructs the RPC header in this * buffer before it invokes ->send_request. - * - * Returns NULL if there was a temporary allocation failure. */ -static void * -xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +static int +xprt_rdma_bc_allocate(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + size_t size = rqst->rq_callsize; struct svcxprt_rdma *rdma; struct page *page; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - /* Prevent an infinite loop: try to make this case work */ - if (size > PAGE_SIZE) + if (size > PAGE_SIZE) { WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", size); + return -EINVAL; + } page = alloc_page(RPCRDMA_DEF_GFP); if (!page) - return NULL; + return -ENOMEM; - return page_address(page); + rqst->rq_buffer = page_address(page); + return 0; } static void diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index be95eced0726..daa7d4d43fd8 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -477,7 +477,15 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -/* +/** + * xprt_rdma_allocate - allocate transport resources for an RPC + * @task: RPC task + * + * Return values: + * 0: Success; rq_buffer points to RPC buffer to use + * ENOMEM: Out of memory, call again later + * EIO: A permanent error occurred, do not retry + * * The RDMA allocate/free functions need the task structure as a place * to hide the struct rpcrdma_req, which is necessary for the actual send/recv * sequence. @@ -486,11 +494,12 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). * We may register rq_rcv_buf when using reply chunks. 
*/ -static void * -xprt_rdma_allocate(struct rpc_task *task, size_t size) +static int +xprt_rdma_allocate(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize + rqst->rq_rcvsize; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; size_t min_size; @@ -498,7 +507,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) - return NULL; + return -ENOMEM; flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) @@ -515,7 +524,8 @@ out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ req->rl_task = task; - return req->rl_sendbuf->rg_base; + rqst->rq_buffer = req->rl_sendbuf->rg_base; + return 0; out_rdmabuf: min_size = r_xprt->rx_data.inline_wsize; @@ -558,7 +568,7 @@ out_sendbuf: out_fail: rpcrdma_buffer_put(req); - return NULL; + return -ENOMEM; } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bf168838a029..bd30b4b18d72 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2533,23 +2533,28 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) * we allocate pages instead doing a kmalloc like rpc_malloc is because we want * to use the server side send routines. */ -static void *bc_malloc(struct rpc_task *task, size_t size) +static int bc_malloc(struct rpc_task *task) { + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize; struct page *page; struct rpc_buffer *buf; - WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer)); - if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) - return NULL; + if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) { + WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n", + size); + return -EINVAL; + } page = alloc_page(GFP_KERNEL); if (!page) - return NULL; + return -ENOMEM; buf = page_address(page); buf->len = PAGE_SIZE; - return buf->data; + rqst->rq_buffer = buf->data; + return 0; } /* -- cgit v1.2.3 From 3435c74aed2d7b743ccbf34616c523ebee7be943 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:29 -0400 Subject: SUNRPC: Generalize the RPC buffer release API xprtrdma needs to allocate the Call and Reply buffers separately. TBH, the reliance on using a single buffer for the pair of XDR buffers is transport implementation-specific. Instead of passing just the rq_buffer into the buf_free method, pass the task structure and let buf_free take care of freeing both XDR buffers at once. There's a micro-optimization here. In the common case, both xprt_release and the transport's buf_free method were checking if rq_buffer was NULL. Now the check is done only once per RPC. 
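rpc_free still has to recover its private struct rpc_buffer header
from the pointer that was handed out; a minimal user-space model of
that container_of step (stand-in struct and simplified macro, not the
kernel's definitions):

    #include <stddef.h>

    struct rpc_buffer { size_t len; char data[]; };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    static size_t example_buf_len(void *buffer)
    {
            struct rpc_buffer *buf =
                    container_of(buffer, struct rpc_buffer, data);

            return buf->len; /* drives the mempool-vs-kfree choice in rpc_free */
    }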
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 10 ++++------ net/sunrpc/xprt.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/transport.c | 26 ++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - net/sunrpc/xprtsock.c | 6 ++---- 6 files changed, 18 insertions(+), 29 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b964d40b259b..6690ebc774ed 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -896,18 +896,16 @@ int rpc_malloc(struct rpc_task *task) EXPORT_SYMBOL_GPL(rpc_malloc); /** - * rpc_free - free buffer allocated via rpc_malloc - * @buffer: buffer to free + * rpc_free - free RPC buffer resources allocated via rpc_malloc + * @task: RPC task * */ -void rpc_free(void *buffer) +void rpc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; size_t size; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); size = buf->len; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ea244b29138b..685e6d225414 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1295,7 +1295,7 @@ void xprt_release(struct rpc_task *task) xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) - xprt->ops->buf_free(req->rq_buffer); + xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 124688ba67e5..fa893507d9eb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -186,7 +186,7 @@ xprt_rdma_bc_allocate(struct rpc_task *task) } static void -xprt_rdma_bc_free(void *buffer) +xprt_rdma_bc_free(struct rpc_task *task) { /* No-op: ctxt and page have already been freed. */ } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index daa7d4d43fd8..ebf14ba437c6 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -523,7 +523,6 @@ xprt_rdma_allocate(struct rpc_task *task) out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ - req->rl_task = task; rqst->rq_buffer = req->rl_sendbuf->rg_base; return 0; @@ -571,31 +570,26 @@ out_fail: return -ENOMEM; } -/* - * This function returns all RDMA resources to the pool. +/** + * xprt_rdma_free - release resources allocated by xprt_rdma_allocate + * @task: RPC task + * + * Caller guarantees rqst->rq_buffer is non-NULL. 
*/ static void -xprt_rdma_free(void *buffer) +xprt_rdma_free(struct rpc_task *task) { - struct rpcrdma_req *req; - struct rpcrdma_xprt *r_xprt; - struct rpcrdma_regbuf *rb; - - if (buffer == NULL) - return; + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); - req = rb->rg_owner; if (req->rl_backchannel) return; - r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); - dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, - !RPC_IS_ASYNC(req->rl_task)); - + !RPC_IS_ASYNC(task)); rpcrdma_buffer_put(req); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9df47c857d27..4838a85bdcf6 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -283,7 +283,6 @@ struct rpcrdma_req { struct list_head rl_free; unsigned int rl_niovs; unsigned int rl_connect_cookie; - struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bd30b4b18d72..bde39f2ff6e5 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2560,13 +2560,11 @@ static int bc_malloc(struct rpc_task *task) /* * Free the space allocated in the bc_alloc routine */ -static void bc_free(void *buffer) +static void bc_free(struct rpc_task *task) { + void *buffer = task->tk_rqstp->rq_buffer; struct rpc_buffer *buf; - if (!buffer) - return; - buf = container_of(buffer, struct rpc_buffer, data); free_page((unsigned long)buf); } -- cgit v1.2.3 From 68778945e46f143ed7974b427a8065f69a4ce944 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:37 -0400 Subject: SUNRPC: Separate buffer pointers for RPC Call and Reply messages For xprtrdma, the RPC Call and Reply buffers are involved in real I/O operations. To start with, the DMA direction of the I/O for a Call is opposite that of a Reply. In the current arrangement, the Reply buffer address is on a four-byte alignment just past the call buffer. Would be friendlier on some platforms if that was at a DMA cache alignment instead. Because the current arrangement allocates a single memory region which contains both buffers, the RPC Reply buffer often contains a page boundary in it when the Call buffer is large enough (which is frequent). It would be a little nicer for setting up DMA operations (and possible registration of the Reply buffer) if the two buffers were separated, well-aligned, and contained as few page boundaries as possible. Now, I could just pad out the single memory region used for the pair of buffers. But frequently that would mean a lot of unused space to ensure the Reply buffer did not have a page boundary. Add a separate pointer to rpc_rqst that points right to the RPC Reply buffer. This makes no difference to xprtsock, but it will help xprtrdma in subsequent patches. 
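For socket transports the two pointers still address one contiguous
allocation; the resulting layout, per the rpc_malloc hunk below:

    rq_buffer                        rq_rbuffer
    |                                |
    v                                v
    +--------------------------------+-------------------------------+
    | RPC Call (rq_callsize bytes)   | RPC Reply (rq_rcvsize bytes)  |
    +--------------------------------+-------------------------------+

    rq_rbuffer = (char *)rq_buffer + rq_callsize;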
Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/clnt.c               | 2 +-
 net/sunrpc/sched.c              | 1 +
 net/sunrpc/xprtrdma/transport.c | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/sunrpc')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 5499fda0c1f3..34dd7b26ee5f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1766,7 +1766,7 @@ rpc_xdr_encode(struct rpc_task *task)
 		     req->rq_buffer,
 		     req->rq_callsize);
 	xdr_buf_init(&req->rq_rcv_buf,
-		     (char *)req->rq_buffer + req->rq_callsize,
+		     req->rq_rbuffer,
 		     req->rq_rcvsize);
 
 	p = rpc_encode_header(task);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 6690ebc774ed..5db68b371db2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -891,6 +891,7 @@ int rpc_malloc(struct rpc_task *task)
 	dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
 			task->tk_pid, size, buf);
 	rqst->rq_buffer = buf->data;
+	rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rpc_malloc);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ebf14ba437c6..136caf3dd299 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -524,6 +524,7 @@ out:
 	dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
 	req->rl_connect_cookie = 0;	/* our reserved value */
 	rqst->rq_buffer = req->rl_sendbuf->rg_base;
+	rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize;
 	return 0;
 
 out_rdmabuf:
--
cgit v1.2.3

From 5a6d1db4556940533f1a5b6521e522f3e46508ed Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Thu, 15 Sep 2016 10:55:45 -0400
Subject: SUNRPC: Add a transport-specific private field in rpc_rqst

Currently there's a hidden and indirect mechanism for finding the
rpcrdma_req that goes with an rpc_rqst. It depends on getting from
the rq_buffer pointer in struct rpc_rqst to the struct rpcrdma_regbuf
that controls that buffer, and then to the struct rpcrdma_req it goes
with.

This was done back in the day to avoid the need to add a per-rqst
pointer or to alter the buf_free API when support for RPC-over-RDMA
was introduced.

I'm about to change the way regbufs work to support larger inline
thresholds. Now is a good time to replace this indirect mechanism
with something that is more straightforward.

I guess this should be considered a clean up.
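The before-and-after lookup, condensed from the hunks below:

    /* Before: indirect, through the regbuf backing rq_buffer */
    rb  = container_of(rqst->rq_buffer, struct rpcrdma_regbuf, rg_base);
    req = rb->rg_owner;

    /* After: direct, through a transport-private pointer in rpc_rqst */
    rpcrdma_set_xprtdata(rqst, req);   /* rqst->rq_xprtdata = req */
    req = rpcr_to_rdmar(rqst);         /* returns rqst->rq_xprtdata */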
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 6 ++---- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 1 - net/sunrpc/xprtrdma/xprt_rdma.h | 13 +++++++------ 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index d3cfaf281e55..c4904f881640 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -55,11 +55,9 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; - rb->rg_owner = req; req->rl_sendbuf = rb; - /* so that rpcr_to_rdmar works when receiving a request */ - rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; - xdr_buf_init(&rqst->rq_snd_buf, rqst->rq_buffer, size); + xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size); + rpcrdma_set_xprtdata(rqst, req); return 0; out_fail: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 136caf3dd299..d83bffa92dfc 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -523,6 +523,7 @@ xprt_rdma_allocate(struct rpc_task *task) out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ + rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize; return 0; @@ -559,7 +560,6 @@ out_sendbuf: rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); if (IS_ERR(rb)) goto out_fail; - rb->rg_owner = req; r_xprt->rx_stats.hardway_register_count += size; rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 799cce6cbe45..93def0bf07af 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1210,7 +1210,6 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) iov->length = size; iov->lkey = ia->ri_pd->local_dma_lkey; rb->rg_size = size; - rb->rg_owner = NULL; return rb; out_free: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 4838a85bdcf6..484855eddb85 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -113,7 +113,6 @@ struct rpcrdma_ep { struct rpcrdma_regbuf { size_t rg_size; - struct rpcrdma_req *rg_owner; struct ib_sge rg_iov; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -297,14 +296,16 @@ struct rpcrdma_req { struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; +static inline void +rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) +{ + rqst->rq_xprtdata = req; +} + static inline struct rpcrdma_req * rpcr_to_rdmar(struct rpc_rqst *rqst) { - void *buffer = rqst->rq_buffer; - struct rpcrdma_regbuf *rb; - - rb = container_of(buffer, struct rpcrdma_regbuf, rg_base); - return rb->rg_owner; + return rqst->rq_xprtdata; } /* -- cgit v1.2.3 From 9c40c49f145f8999ecbf81683aeb31d92b61b966 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:55:53 -0400 Subject: xprtrdma: Initialize separate RPC call and reply buffers RPC-over-RDMA needs to separate its RPC call and reply buffers. 
o When an RPC Call is sent, rq_snd_buf is DMA mapped for an RDMA Send operation using DMA_TO_DEVICE o If the client expects a large RPC reply, it DMA maps rq_rcv_buf as part of a Reply chunk using DMA_FROM_DEVICE The two mappings are for data movement in opposite directions. DMA-API.txt suggests that if these mappings share a DMA cacheline, bad things can happen. This could occur in the final bytes of rq_snd_buf and the first bytes of rq_rcv_buf if the two buffers happen to share a DMA cacheline. On x86_64 the cacheline size is typically 8 bytes, and RPC call messages are usually much smaller than the send buffer, so this hasn't been a noticeable problem. But the DMA cacheline size can be larger on other platforms. Also, often rq_rcv_buf starts most of the way into a page, thus an additional RDMA segment is needed to map and register the end of that buffer. Try to avoid that scenario to reduce the cost of registering and invalidating Reply chunks. Instead of carrying a single regbuf that covers both rq_snd_buf and rq_rcv_buf, each struct rpcrdma_req now carries one regbuf for rq_snd_buf and one regbuf for rq_rcv_buf. Some incidental changes worth noting: - To clear out some spaghetti, refactor xprt_rdma_allocate. - The value stored in rg_size is the same as the value stored in the iov.length field, so eliminate rg_size Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 150 +++++++++++++++++++++++++--------------- net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 6 +- 3 files changed, 99 insertions(+), 59 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d83bffa92dfc..ecdc3ad7dbb6 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -477,6 +477,86 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } +/* Allocate a fixed-size buffer in which to construct and send the + * RPC-over-RDMA header for this request. + */ +static bool +rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + gfp_t flags) +{ + size_t size = r_xprt->rx_data.inline_wsize; + struct rpcrdma_regbuf *rb; + + if (req->rl_rdmabuf) + return true; + + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); + if (IS_ERR(rb)) + return false; + + r_xprt->rx_stats.hardway_register_count += size; + req->rl_rdmabuf = rb; + return true; +} + +/* RPC/RDMA marshaling may choose to send payload bearing ops inline, + * if the resulting Call message is smaller than the inline threshold. + * The value of the "rq_callsize" argument accounts for RPC header + * requirements, but not for the data payload in these cases. + * + * See rpcrdma_inline_pullup. + */ +static bool +rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + size_t min_size; + + if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size) + return true; + + min_size = max_t(size_t, size, r_xprt->rx_data.inline_wsize); + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); + r_xprt->rx_stats.hardway_register_count += min_size; + req->rl_sendbuf = rb; + return true; +} + +/* The rq_rcv_buf is used only if a Reply chunk is necessary. + * The decision to use a Reply chunk is made later in + * rpcrdma_marshal_req. This buffer is registered at that time. 
+ * + * Otherwise, the associated RPC Reply arrives in a separate + * Receive buffer, arbitrarily chosen by the HCA. The buffer + * allocated here for the RPC Reply is not utilized in that + * case. See rpcrdma_inline_fixup. + * + * A regbuf is used here to remember the buffer size. + */ +static bool +rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_recvbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_recvbuf = rb; + return true; +} + /** * xprt_rdma_allocate - allocate transport resources for an RPC * @task: RPC task @@ -487,22 +567,18 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) * EIO: A permanent error occurred, do not retry * * The RDMA allocate/free functions need the task structure as a place - * to hide the struct rpcrdma_req, which is necessary for the actual send/recv - * sequence. + * to hide the struct rpcrdma_req, which is necessary for the actual + * send/recv sequence. * - * The RPC layer allocates both send and receive buffers in the same call - * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). - * We may register rq_rcv_buf when using reply chunks. + * xprt_rdma_allocate provides buffers that are already mapped for + * DMA, and a local DMA lkey is provided for each. */ static int xprt_rdma_allocate(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - size_t size = rqst->rq_callsize + rqst->rq_rcvsize; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; - size_t min_size; gfp_t flags; req = rpcrdma_buffer_get(&r_xprt->rx_buf); @@ -513,59 +589,23 @@ xprt_rdma_allocate(struct rpc_task *task) if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; - if (req->rl_rdmabuf == NULL) - goto out_rdmabuf; - if (req->rl_sendbuf == NULL) - goto out_sendbuf; - if (size > req->rl_sendbuf->rg_size) - goto out_sendbuf; + if (!rpcrdma_get_rdmabuf(r_xprt, req, flags)) + goto out_fail; + if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) + goto out_fail; + if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) + goto out_fail; + + dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n", + task->tk_pid, __func__, rqst->rq_callsize, + rqst->rq_rcvsize, req); -out: - dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; - rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_rcvsize; + rqst->rq_rbuffer = req->rl_recvbuf->rg_base; return 0; -out_rdmabuf: - min_size = r_xprt->rx_data.inline_wsize; - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags); - if (IS_ERR(rb)) - goto out_fail; - req->rl_rdmabuf = rb; - -out_sendbuf: - /* XDR encoding and RPC/RDMA marshaling of this request has not - * yet occurred. Thus a lower bound is needed to prevent buffer - * overrun during marshaling. - * - * RPC/RDMA marshaling may choose to send payload bearing ops - * inline, if the result is smaller than the inline threshold. - * The value of the "size" argument accounts for header - * requirements but not for the payload in these cases. 
- * - * Likewise, allocate enough space to receive a reply up to the - * size of the inline threshold. - * - * It's unlikely that both the send header and the received - * reply will be large, but slush is provided here to allow - * flexibility when marshaling. - */ - min_size = r_xprt->rx_data.inline_rsize; - min_size += r_xprt->rx_data.inline_wsize; - if (size < min_size) - size = min_size; - - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); - if (IS_ERR(rb)) - goto out_fail; - - r_xprt->rx_stats.hardway_register_count += size; - rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); - req->rl_sendbuf = rb; - goto out; - out_fail: rpcrdma_buffer_put(req); return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 93def0bf07af..fc6b4ea8b7ec 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -975,6 +975,7 @@ rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) void rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) { + rpcrdma_free_regbuf(ia, req->rl_recvbuf); rpcrdma_free_regbuf(ia, req->rl_sendbuf); rpcrdma_free_regbuf(ia, req->rl_rdmabuf); kfree(req); @@ -1209,7 +1210,6 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) iov->length = size; iov->lkey = ia->ri_pd->local_dma_lkey; - rb->rg_size = size; return rb; out_free: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 484855eddb85..444f6370d46c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -112,7 +112,6 @@ struct rpcrdma_ep { */ struct rpcrdma_regbuf { - size_t rg_size; struct ib_sge rg_iov; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -285,8 +284,9 @@ struct rpcrdma_req { struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; - struct rpcrdma_regbuf *rl_rdmabuf; - struct rpcrdma_regbuf *rl_sendbuf; + struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ + struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ + struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ struct ib_cqe rl_cqe; struct list_head rl_all; -- cgit v1.2.3 From 08cf2efd5423121985af5962d66e6db14dff4130 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:56:02 -0400 Subject: xprtrdma: Use smaller buffers for RPC-over-RDMA headers Commit 949317464bc2 ("xprtrdma: Limit number of RDMA segments in RPC-over-RDMA headers") capped the number of chunks that may appear in RPC-over-RDMA headers. The maximum header size can be estimated and fixed to avoid allocating buffer space that is never used. 
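A rough estimate of that fixed maximum (these numbers are an
illustration, not taken from the patch: a 28-byte fixed header,
RPCRDMA_HDRLEN_MIN, plus eight read-list entries of roughly 24 bytes
each):

    28 + 8 * 24 = 220 bytes

so the 256-byte RPCRDMA_HDRBUF_SIZE introduced below comfortably
covers the worst case, where the old code allocated inline_wsize (at
least 1024) bytes for every request.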
Signed-off-by: Chuck Lever
Signed-off-by: Anna Schumaker
---
 net/sunrpc/xprtrdma/backchannel.c | 5 ++---
 net/sunrpc/xprtrdma/transport.c   | 2 +-
 net/sunrpc/xprtrdma/xprt_rdma.h   | 5 ++++-
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/sunrpc')

diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index c4904f881640..60fc9915292f 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -45,13 +45,12 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
 		return PTR_ERR(req);
 	req->rl_backchannel = true;
 
-	size = r_xprt->rx_data.inline_wsize;
-	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+	rb = rpcrdma_alloc_regbuf(ia, RPCRDMA_HDRBUF_SIZE, GFP_KERNEL);
 	if (IS_ERR(rb))
 		goto out_fail;
 	req->rl_rdmabuf = rb;
 
-	size += r_xprt->rx_data.inline_rsize;
+	size = r_xprt->rx_data.inline_rsize;
 	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
 	if (IS_ERR(rb))
 		goto out_fail;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ecdc3ad7dbb6..94dbfd3e89a7 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -484,7 +484,7 @@ static bool
 rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 		    gfp_t flags)
 {
-	size_t size = r_xprt->rx_data.inline_wsize;
+	size_t size = RPCRDMA_HDRBUF_SIZE;
 	struct rpcrdma_regbuf *rb;
 
 	if (req->rl_rdmabuf)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 444f6370d46c..cc426b165c09 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -160,7 +160,10 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  * The smallest inline threshold is 1024 bytes, ensuring that
  * at least 750 bytes are available for RPC messages.
  */
-#define RPCRDMA_MAX_HDR_SEGS (8)
+enum {
+	RPCRDMA_MAX_HDR_SEGS = 8,
+	RPCRDMA_HDRBUF_SIZE = 256,
+};
 
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
--
cgit v1.2.3

From 99ef4db329f1ee2413dad49346e72a6c902474d1 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Thu, 15 Sep 2016 10:56:10 -0400
Subject: xprtrdma: Replace DMA_BIDIRECTIONAL

The use of DMA_BIDIRECTIONAL is discouraged by DMA-API.txt.
Fortunately, xprtrdma now knows which direction I/O is going as soon
as it allocates each regbuf.

The RPC Call and Reply buffers are no longer the same regbuf. They
can each be labeled correctly now. The RPC Reply buffer is never part
of either a Send or Receive WR, but it can be part of a Reply chunk,
which is mapped and registered via ->ro_map. So it is not DMA mapped
when it is allocated (DMA_NONE), to avoid a double-mapping.

Since Receive buffers are no longer DMA_BIDIRECTIONAL and their
contents are never modified by the host CPU, DMA-API-HOWTO.txt
suggests that a DMA sync before posting each buffer should be
unnecessary. (See my_card_interrupt_handler).
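The direction each regbuf ends up with, collected from the hunks
below:

    rl_rdmabuf  (RPC-over-RDMA header)   DMA_TO_DEVICE
    rl_sendbuf  (rq_snd_buf)             DMA_TO_DEVICE
    rl_recvbuf  (rq_rcv_buf)             DMA_NONE, mapped via ->ro_map
                                         only when a Reply chunk is used
    rr_rdmabuf  (Receive buffers)        DMA_FROM_DEVICE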
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 5 ++-- net/sunrpc/xprtrdma/transport.c | 7 ++--- net/sunrpc/xprtrdma/verbs.c | 55 ++++++++++++++++++--------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++- 4 files changed, 36 insertions(+), 35 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 60fc9915292f..ceae87206347 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -45,13 +45,14 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, return PTR_ERR(req); req->rl_backchannel = true; - rb = rpcrdma_alloc_regbuf(ia, RPCRDMA_HDRBUF_SIZE, GFP_KERNEL); + rb = rpcrdma_alloc_regbuf(ia, RPCRDMA_HDRBUF_SIZE, + DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; req->rl_rdmabuf = rb; size = r_xprt->rx_data.inline_rsize; - rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + rb = rpcrdma_alloc_regbuf(ia, size, DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) goto out_fail; req->rl_sendbuf = rb; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 94dbfd3e89a7..34246916434b 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -490,7 +490,7 @@ rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (req->rl_rdmabuf) return true; - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, DMA_TO_DEVICE, flags); if (IS_ERR(rb)) return false; @@ -517,7 +517,8 @@ rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return true; min_size = max_t(size_t, size, r_xprt->rx_data.inline_wsize); - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags); + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, + DMA_TO_DEVICE, flags); if (IS_ERR(rb)) return false; @@ -547,7 +548,7 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size) return true; - rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, DMA_NONE, flags); if (IS_ERR(rb)) return false; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index fc6b4ea8b7ec..9edea34aeb36 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -866,7 +866,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) goto out; rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize, - GFP_KERNEL); + DMA_FROM_DEVICE, GFP_KERNEL); if (IS_ERR(rep->rr_rdmabuf)) { rc = PTR_ERR(rep->rr_rdmabuf); goto out_free; @@ -1172,27 +1172,24 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) spin_unlock(&buffers->rb_lock); } -/* - * Wrappers for internal-use kmalloc memory registration, used by buffer code. - */ - /** - * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers + * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers * @ia: controlling rpcrdma_ia * @size: size of buffer to be allocated, in bytes + * @direction: direction of data movement * @flags: GFP flags * - * Returns pointer to private header of an area of internally - * registered memory, or an ERR_PTR. The registered buffer follows - * the end of the private header. + * Returns an ERR_PTR, or a pointer to a regbuf, which is a + * contiguous memory region that is DMA mapped persistently, and + * is registered for local I/O. 
* * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for - * receiving the payload of RDMA RECV operations. regbufs are not - * used for RDMA READ/WRITE operations, thus are registered only for - * LOCAL access. + * receiving the payload of RDMA RECV operations. During Long Calls + * or Replies they may be registered externally via ro_map. */ struct rpcrdma_regbuf * -rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) +rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, + enum dma_data_direction direction, gfp_t flags) { struct rpcrdma_regbuf *rb; struct ib_sge *iov; @@ -1201,15 +1198,20 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) if (rb == NULL) goto out; + rb->rg_direction = direction; iov = &rb->rg_iov; - iov->addr = ib_dma_map_single(ia->ri_device, - (void *)rb->rg_base, size, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(ia->ri_device, iov->addr)) - goto out_free; - iov->length = size; iov->lkey = ia->ri_pd->local_dma_lkey; + + if (direction != DMA_NONE) { + iov->addr = ib_dma_map_single(ia->ri_device, + (void *)rb->rg_base, + rdmab_length(rb), + rb->rg_direction); + if (ib_dma_mapping_error(ia->ri_device, iov->addr)) + goto out_free; + } + return rb; out_free: @@ -1226,14 +1228,14 @@ out: void rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) { - struct ib_sge *iov; - if (!rb) return; - iov = &rb->rg_iov; - ib_dma_unmap_single(ia->ri_device, - iov->addr, iov->length, DMA_BIDIRECTIONAL); + if (rb->rg_direction != DMA_NONE) { + ib_dma_unmap_single(ia->ri_device, rdmab_addr(rb), + rdmab_length(rb), rb->rg_direction); + } + kfree(rb); } @@ -1305,11 +1307,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; recv_wr.num_sge = 1; - ib_dma_sync_single_for_cpu(ia->ri_device, - rdmab_addr(rep->rr_rdmabuf), - rdmab_length(rep->rr_rdmabuf), - DMA_BIDIRECTIONAL); - rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); if (rc) goto out_postrecv; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cc426b165c09..9569b212be54 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -113,6 +113,7 @@ struct rpcrdma_ep { struct rpcrdma_regbuf { struct ib_sge rg_iov; + enum dma_data_direction rg_direction; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -477,7 +478,8 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, - size_t, gfp_t); + size_t, enum dma_data_direction, + gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); -- cgit v1.2.3 From 54cbd6b0c6b9410782da3efe7377d43bb636faaf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Sep 2016 10:56:18 -0400 Subject: xprtrdma: Delay DMA mapping Send and Receive buffers Currently, each regbuf is allocated and DMA mapped at the same time. This is done during transport creation. When a device driver is unloaded, every DMA-mapped buffer in use by a transport has to be unmapped, and then remapped to the new device if the driver is loaded again. Remapping will have to be done _after_ the connect worker has set up the new device. But there's an ordering problem: call_allocate, which invokes xprt_rdma_allocate which calls rpcrdma_alloc_regbuf to allocate Send buffers, happens _before_ the connect worker can run to set up the new device. Instead, at transport creation, allocate each buffer, but leave it unmapped. 
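The send path then maps lazily. The body of rpcrdma_dma_map_regbuf
falls outside this excerpt; a sketch of what it must do, inferred from
the callers added below and hedged accordingly:

    /* Sketch only: map rb now if it has not been mapped yet. */
    static bool
    rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
    {
            if (rb->rg_iov.addr)            /* already mapped */
                    return true;
            if (rb->rg_direction == DMA_NONE)
                    return false;
            rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
                                                (void *)rb->rg_base,
                                                rdmab_length(rb),
                                                rb->rg_direction);
            return !ib_dma_mapping_error(ia->ri_device, rb->rg_iov.addr);
    }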
Once the RPC carries these buffers into ->send_request, by which time a transport connection should have been established, check to see that the RPC's buffers have been DMA mapped. If not, map them there. When device driver unplug support is added, it will simply unmap all the transport's regbufs, but it doesn't have to deallocate the underlying memory. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 8 +++++ net/sunrpc/xprtrdma/rpc_rdma.c | 9 +++++ net/sunrpc/xprtrdma/verbs.c | 71 +++++++++++++++++++++++++-------------- net/sunrpc/xprtrdma/xprt_rdma.h | 16 +++++++++ 4 files changed, 78 insertions(+), 26 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index ceae87206347..8bc249e50713 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -230,16 +230,24 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) __func__, (int)rpclen, rqst->rq_svec[0].iov_base); #endif + if (!rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, req->rl_rdmabuf)) + goto out_map; req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); + if (!rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, req->rl_sendbuf)) + goto out_map; req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); req->rl_niovs = 2; return 0; + +out_map: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return -EIO; } /** diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 845586f7df47..68a39c004851 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -681,6 +681,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) transfertypes[rtype], transfertypes[wtype], hdrlen, rpclen); + if (!rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, req->rl_rdmabuf)) + goto out_map; req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); @@ -689,6 +691,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (rtype == rpcrdma_areadch) return 0; + if (!rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, req->rl_sendbuf)) + goto out_map; req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); @@ -704,6 +708,11 @@ out_overflow: out_unmap: r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); return PTR_ERR(iptr); + +out_map: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + iptr = ERR_PTR(-EIO); + goto out_unmap; } /* diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 9edea34aeb36..09346cd01bcb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1179,9 +1179,8 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * @direction: direction of data movement * @flags: GFP flags * - * Returns an ERR_PTR, or a pointer to a regbuf, which is a - * contiguous memory region that is DMA mapped persistently, and - * is registered for local I/O. + * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that + * can be persistently DMA-mapped for I/O. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for * receiving the payload of RDMA RECV operations. 
During Long Calls @@ -1192,32 +1191,50 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, enum dma_data_direction direction, gfp_t flags) { struct rpcrdma_regbuf *rb; - struct ib_sge *iov; rb = kmalloc(sizeof(*rb) + size, fl