summary refs log tree commit diff stats
path: root/net/sunrpc
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc')
-rw-r--r-- net/sunrpc/auth.c 2
-rw-r--r-- net/sunrpc/auth_gss/auth_gss.c 2
-rw-r--r-- net/sunrpc/auth_gss/gss_mech_switch.c 4
-rw-r--r-- net/sunrpc/auth_gss/svcauth_gss.c 2
-rw-r--r-- net/sunrpc/backchannel_rqst.c 4
-rw-r--r-- net/sunrpc/cache.c 2
-rw-r--r-- net/sunrpc/sched.c 5
-rw-r--r-- net/sunrpc/sunrpc.h 13
-rw-r--r-- net/sunrpc/svc_xprt.c 5
-rw-r--r-- net/sunrpc/svcauth.c 2
-rw-r--r-- net/sunrpc/svcsock.c 17
-rw-r--r-- net/sunrpc/xdr.c 174
-rw-r--r-- net/sunrpc/xprt.c 32
-rw-r--r-- net/sunrpc/xprtrdma/rpc_rdma.c 119
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 643
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_sendto.c 230
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_transport.c 69
-rw-r--r-- net/sunrpc/xprtrdma/transport.c 90
-rw-r--r-- net/sunrpc/xprtrdma/verbs.c 753
-rw-r--r-- net/sunrpc/xprtrdma/xprt_rdma.h 17
-rw-r--r-- net/sunrpc/xprtsock.c 16
21 files changed, 970 insertions, 1231 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 5285ead196c0..247e973544bf 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -296,7 +296,7 @@ static void
rpcauth_unhash_cred_locked(struct rpc_cred *cred)
{
hlist_del_rcu(&cred->cr_hash);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags);
}
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 36e431ee1c90..b6e440baccc3 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -143,7 +143,7 @@ gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx)
gss_get_ctx(ctx);
rcu_assign_pointer(gss_cred->gc_ctx, ctx);
set_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(RPCAUTH_CRED_NEW, &cred->cr_flags);
}
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 27ce26240932..92d5ab99fbf3 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -218,10 +218,8 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
spin_lock(&registered_mechs_lock);
list_for_each_entry(pos, &registered_mechs, gm_list) {
- if (!mech_supports_pseudoflavor(pos, pseudoflavor)) {
- module_put(pos->gm_owner);
+ if (!mech_supports_pseudoflavor(pos, pseudoflavor))
continue;
- }
if (try_module_get(pos->gm_owner))
gm = pos;
break;
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 0f73f4507746..4ce5eccec1f6 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1503,6 +1503,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
if (unwrap_integ_data(rqstp, &rqstp->rq_arg,
gc->gc_seq, rsci->mechctx))
goto garbage_args;
+ rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE;
break;
case RPC_GSS_SVC_PRIVACY:
/* placeholders for length and seq. number: */
@@ -1511,6 +1512,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
if (unwrap_priv_data(rqstp, &rqstp->rq_arg,
gc->gc_seq, rsci->mechctx))
goto garbage_args;
+ rqstp->rq_auth_slack = RPC_MAX_AUTH_SIZE * 2;
break;
default:
goto auth_err;
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 3513d559bc45..9761a0da964d 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -244,10 +244,10 @@ void xprt_free_bc_request(struct rpc_rqst *req)
dprintk("RPC: free backchannel req=%p\n", req);
req->rq_connect_cookie = xprt->connect_cookie - 1;
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
if (!xprt_need_to_requeue(xprt)) {
/*
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index ae333c1845bb..066362141133 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -374,7 +374,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
}
return;
out:
- printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
+ printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
}
EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 25578afe1548..c0365c14b858 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -832,7 +832,8 @@ static void rpc_async_schedule(struct work_struct *work)
* @size: requested byte size
*
* To prevent rpciod from hanging, this allocator never sleeps,
- * returning NULL if the request cannot be serviced immediately.
+ * returning NULL and suppressing warning if the request cannot be serviced
+ * immediately.
* The caller can arrange to sleep in a way that is safe for rpciod.
*
* Most requests are 'small' (under 2KiB) and can be serviced from a
@@ -845,7 +846,7 @@ static void rpc_async_schedule(struct work_struct *work)
void *rpc_malloc(struct rpc_task *task, size_t size)
{
struct rpc_buffer *buf;
- gfp_t gfp = GFP_NOWAIT;
+ gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
if (RPC_IS_SWAPPER(task))
gfp |= __GFP_MEMALLOC;
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 14c9f6d1c5ff..f2b7cb540e61 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -43,6 +43,19 @@ static inline int rpc_reply_expected(struct rpc_task *task)
(task->tk_msg.rpc_proc->p_decode != NULL);
}
+static inline int sock_is_loopback(struct sock *sk)
+{
+ struct dst_entry *dst;
+ int loopback = 0;
+ rcu_read_lock();
+ dst = rcu_dereference(sk->sk_dst_cache);
+ if (dst && dst->dev &&
+ (dst->dev->features & NETIF_F_LOOPBACK))
+ loopback = 1;
+ rcu_read_unlock();
+ return loopback;
+}
+
int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
struct page *headpage, unsigned long headoffset,
struct page *tailpage, unsigned long tailoffset);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 06c6ff0cb911..b4737fbdec13 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -597,6 +597,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
}
rqstp->rq_pages[i] = p;
}
+ rqstp->rq_page_end = &rqstp->rq_pages[i];
rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
/* Make arg->head point to first page and arg->pages point to rest */
@@ -730,6 +731,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
newxpt = xprt->xpt_ops->xpo_accept(xprt);
if (newxpt)
svc_add_new_temp_xprt(serv, newxpt);
+ else
+ module_put(xprt->xpt_class->xcl_owner);
} else if (xprt->xpt_ops->xpo_has_wspace(xprt)) {
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
@@ -793,7 +796,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
clear_bit(XPT_OLD, &xprt->xpt_flags);
- rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
+ rqstp->rq_secure = xprt->xpt_ops->xpo_secure_port(rqstp);
rqstp->rq_chandle.defer = svc_defer;
if (serv->sv_stats)
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 2af7b0cba43a..79c0f3459b5c 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -54,6 +54,8 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
}
spin_unlock(&authtab_lock);
+ rqstp->rq_auth_slack = 0;
+
rqstp->rq_authop = aops;
return aops->accept(rqstp, authp);
}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 43bcb4699d69..b507cd327d9b 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -400,6 +400,12 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
release_sock(sock->sk);
#endif
}
+
+static int svc_sock_secure_port(struct svc_rqst *rqstp)
+{
+ return svc_port_is_privileged(svc_addr(rqstp));
+}
+
/*
* INET callback when data has been received on the socket.
*/
@@ -678,6 +684,7 @@ static struct svc_xprt_ops svc_udp_ops = {
.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
.xpo_has_wspace = svc_udp_has_wspace,
.xpo_accept = svc_udp_accept,
+ .xpo_secure_port = svc_sock_secure_port,
};
static struct svc_xprt_class svc_udp_class = {
@@ -842,8 +849,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
* tell us anything. For now just warn about unpriv connections.
*/
if (!svc_port_is_privileged(sin)) {
- dprintk(KERN_WARNING
- "%s: connect from unprivileged port: %s\n",
+ dprintk("%s: connect from unprivileged port: %s\n",
serv->sv_name,
__svc_print_addr(sin, buf, sizeof(buf)));
}
@@ -867,6 +873,10 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
}
svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
+ if (sock_is_loopback(newsock->sk))
+ set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
+ else
+ clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
if (serv->sv_stats)
serv->sv_stats->nettcpconn++;
@@ -1112,6 +1122,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_xprt_ctxt = NULL;
rqstp->rq_prot = IPPROTO_TCP;
+ rqstp->rq_local = !!test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags);
p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
calldir = p[1];
@@ -1234,6 +1245,7 @@ static struct svc_xprt_ops svc_tcp_bc_ops = {
.xpo_detach = svc_bc_tcp_sock_detach,
.xpo_free = svc_bc_sock_free,
.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
+ .xpo_secure_port = svc_sock_secure_port,
};
static struct svc_xprt_class svc_tcp_bc_class = {
@@ -1272,6 +1284,7 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
+ .xpo_secure_port = svc_sock_secure_port,
};
static struct svc_xprt_class svc_tcp_class = {
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index dd97ba3c4456..23fb4e75e245 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -462,6 +462,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
struct kvec *iov = buf->head;
int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
+ xdr_set_scratch_buffer(xdr, NULL, 0);
BUG_ON(scratch_len < 0);
xdr->buf = buf;
xdr->iov = iov;
@@ -482,6 +483,73 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
EXPORT_SYMBOL_GPL(xdr_init_encode);
/**
+ * xdr_commit_encode - Ensure all data is written to buffer
+ * @xdr: pointer to xdr_stream
+ *
+ * We handle encoding across page boundaries by giving the caller a
+ * temporary location to write to, then later copying the data into
+ * place; xdr_commit_encode does that copying.
+ *
+ * Normally the caller doesn't need to call this directly, as the
+ * following xdr_reserve_space will do it. But an explicit call may be
+ * required at the end of encoding, or any other time when the xdr_buf
+ * data might be read.
+ */
+void xdr_commit_encode(struct xdr_stream *xdr)
+{
+ int shift = xdr->scratch.iov_len;
+ void *page;
+
+ if (shift == 0)
+ return;
+ page = page_address(*xdr->page_ptr);
+ memcpy(xdr->scratch.iov_base, page, shift);
+ memmove(page, page + shift, (void *)xdr->p - page);
+ xdr->scratch.iov_len = 0;
+}
+EXPORT_SYMBOL_GPL(xdr_commit_encode);
+
+__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes)
+{
+ static __be32 *p;
+ int space_left;
+ int frag1bytes, frag2bytes;
+
+ if (nbytes > PAGE_SIZE)
+ return NULL; /* Bigger buffers require special handling */
+ if (xdr->buf->len + nbytes > xdr->buf->buflen)
+ return NULL; /* Sorry, we're totally out of space */
+ frag1bytes = (xdr->end - xdr->p) << 2;
+ frag2bytes = nbytes - frag1bytes;
+ if (xdr->iov)
+ xdr->iov->iov_len += frag1bytes;
+ else
+ xdr->buf->page_len += frag1bytes;
+ xdr->page_ptr++;
+ xdr->iov = NULL;
+ /*
+ * If the last encode didn't end exactly on a page boundary, the
+ * next one will straddle boundaries. Encode into the next
+ * page, then copy it back later in xdr_commit_encode. We use
+ * the "scratch" iov to track any temporarily unused fragment of
+ * space at the end of the previous buffer:
+ */
+ xdr->scratch.iov_base = xdr->p;
+ xdr->scratch.iov_len = frag1bytes;
+ p = page_address(*xdr->page_ptr);
+ /*
+ * Note this is where the next encode will start after we've
+ * shifted this one back:
+ */
+ xdr->p = (void *)p + frag2bytes;
+ space_left = xdr->buf->buflen - xdr->buf->len;
+ xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE);
+ xdr->buf->page_len += frag2bytes;
+ xdr->buf->len += nbytes;
+ return p;
+}
+
+/**
* xdr_reserve_space - Reserve buffer space for sending
* @xdr: pointer to xdr_stream
* @nbytes: number of bytes to reserve
@@ -495,20 +563,122 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
__be32 *p = xdr->p;
__be32 *q;
+ xdr_commit_encode(xdr);
/* align nbytes on the next 32-bit boundary */
nbytes += 3;
nbytes &= ~3;
q = p + (nbytes >> 2);
if (unlikely(q > xdr->end || q < p))
- return NULL;
+ return xdr_get_next_encode_buffer(xdr, nbytes);
xdr->p = q;
- xdr->iov->iov_len += nbytes;
+ if (xdr->iov)
+ xdr->iov->iov_len += nbytes;
+ else
+ xdr->buf->page_len += nbytes;
xdr->buf->len += nbytes;
return p;
}
EXPORT_SYMBOL_GPL(xdr_reserve_space);
/**
+ * xdr_truncate_encode - truncate an encode buffer
+ * @xdr: pointer to xdr_stream
+ * @len: new length of buffer
+ *
+ * Truncates the xdr stream, so that xdr->buf->len == len,
+ * and xdr->p points at offset len from the start of the buffer, and
+ * head, tail, and page lengths are adjusted to correspond.
+ *
+ * If this means moving xdr->p to a different buffer, we assume that
+ * the end pointer should be set to the end of the current page,
+ * except in the case of the head buffer when we assume the head
+ * buffer's current length represents the end of the available buffer.
+ *
+ * This is *not* safe to use on a buffer that already has inlined page
+ * cache pages (as in a zero-copy server read reply), except for the
+ * simple case of truncating from one position in the tail to another.
+ *
+ */
+void xdr_truncate_encode(struct xdr_stream *xdr, size_t len)
+{
+ struct xdr_buf *buf = xdr->buf;
+ struct kvec *head = buf->head;
+ struct kvec *tail = buf->tail;
+ int fraglen;
+ int new, old;
+
+ if (len > buf->len) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ xdr_commit_encode(xdr);
+
+ fraglen = min_t(int, buf->len - len, tail->iov_len);
+ tail->iov_len -= fraglen;
+ buf->len -= fraglen;
+ if (tail->iov_len && buf->len == len) {
+ xdr->p = tail->iov_base + tail->iov_len;
+ /* xdr->end, xdr->iov should be set already */
+ return;
+ }
+ WARN_ON_ONCE(fraglen);
+ fraglen = min_t(int, buf->len - len, buf->page_len);
+ buf->page_len -= fraglen;
+ buf->len -= fraglen;
+
+ new = buf->page_base + buf->page_len;
+ old = new + fraglen;
+ xdr->page_ptr -= (old >> PAGE_SHIFT) - (new >> PAGE_SHIFT);
+
+ if (buf->page_len && buf->len == len) {
+ xdr->p = page_address(*xdr->page_ptr);
+ xdr->end = (void *)xdr->p + PAGE_SIZE;
+ xdr->p = (void *)xdr->p + (new % PAGE_SIZE);
+ /* xdr->iov should already be NULL */
+ return;
+ }
+ if (fraglen) {
+ xdr->end = head->iov_base + head->iov_len;
+ xdr->page_ptr--;
+ }
+ /* (otherwise assume xdr->end is already set) */
+ head->iov_len = len;
+ buf->len = len;
+ xdr->p = head->iov_base + head->iov_len;
+ xdr->iov = buf->head;
+}
+EXPORT_SYMBOL(xdr_truncate_encode);
+
+/**
+ * xdr_restrict_buflen - decrease available buffer space
+ * @xdr: pointer to xdr_stream
+ * @newbuflen: new maximum number of bytes available
+ *
+ * Adjust our idea of how much space is available in the buffer.
+ * If we've already used too much space in the buffer, returns -1.
+ * If the available space is already smaller than newbuflen, returns 0
+ * and does nothing. Otherwise, adjusts xdr->buf->buflen to newbuflen
+ * and ensures xdr->end is set at most offset newbuflen from the start
+ * of the buffer.
+ */
+int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen)
+{
+ struct xdr_buf *buf = xdr->buf;
+ int left_in_this_buf = (void *)xdr->end - (void *)xdr->p;
+ int end_offset = buf->len + left_in_this_buf;
+
+ if (newbuflen < 0 || newbuflen < buf->len)
+ return -1;
+ if (newbuflen > buf->buflen)
+ return 0;
+ if (newbuflen < end_offset)
+ xdr->end = (void *)xdr->end + newbuflen - end_offset;
+ buf->buflen = newbuflen;
+ return 0;
+}
+EXPORT_SYMBOL(xdr_restrict_buflen);
+
+/**
* xdr_write_pages - Insert a list of pages into an XDR buffer for sending
* @xdr: pointer to xdr_stream
* @pages: list of pages
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index d173f79947c6..c3b2b3369e52 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -71,24 +71,6 @@ static void xprt_destroy(struct rpc_xprt *xprt);
static DEFINE_SPINLOCK(xprt_list_lock);
static LIST_HEAD(xprt_list);
-/*
- * The transport code maintains an estimate on the maximum number of out-
- * standing RPC requests, using a smoothed version of the congestion
- * avoidance implemented in 44BSD. This is basically the Van Jacobson
- * congestion algorithm: If a retransmit occurs, the congestion window is
- * halved; otherwise, it is incremented by 1/cwnd when
- *
- * - a reply is received and
- * - a full number of requests are outstanding and
- * - the congestion window hasn't been updated recently.
- */
-#define RPC_CWNDSHIFT (8U)
-#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT)
-#define RPC_INITCWND RPC_CWNDSCALE
-#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT)
-
-#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd)
-
/**
* xprt_register_transport - register a transport implementation
* @transport: transport to register
@@ -230,9 +212,9 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
{
xprt->snd_task = NULL;
if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(XPRT_LOCKED, &xprt->state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
} else
queue_work(rpciod_workqueue, &xprt->task_cleanup);
}
@@ -446,7 +428,15 @@ EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
* @task: recently completed RPC request used to adjust window
* @result: result code of completed RPC request
*
- * We use a time-smoothed congestion estimator to avoid heavy oscillation.
+ * The transport code maintains an estimate on the maximum number of out-
+ * standing RPC requests, using a smoothed version of the congestion
+ * avoidance implemented in 44BSD. This is basically the Van Jacobson
+ * congestion algorithm: If a retransmit occurs, the congestion window is
+ * halved; otherwise, it is incremented by 1/cwnd when
+ *
+ * - a reply is received and
+ * - a full number of requests are outstanding and
+ * - the congestion window hasn't been updated recently.
*/
void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result)
{
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 96ead526b125..693966d3f33b 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -78,8 +78,7 @@ static const char transfertypes[][12] = {
* elements. Segments are then coalesced when registered, if possible
* within the selected memreg mode.
*
- * Note, this routine is never called if the connection's memory
- * registration strategy is 0 (bounce buffers).
+ * Returns positive number of segments converted, or a negative errno.
*/
static int
@@ -102,10 +101,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
page_base = xdrbuf->page_base & ~PAGE_MASK;
p = 0;
while (len && n < nsegs) {
+ if (!ppages[p]) {
+ /* alloc the pagelist for receiving buffer */
+ ppages[p] = alloc_page(GFP_ATOMIC);
+ if (!ppages[p])
+ return -ENOMEM;
+ }
seg[n].mr_page = ppages[p];
seg[n].mr_offset = (void *)(unsigned long) page_base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
- BUG_ON(seg[n].mr_len > PAGE_SIZE);
+ if (seg[n].mr_len > PAGE_SIZE)
+ return -EIO;
len -= seg[n].mr_len;
++n;
++p;
@@ -114,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
/* Message overflows the seg array */
if (len && n == nsegs)
- return 0;
+ return -EIO;
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
@@ -123,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
return n;
if (n == nsegs)
/* Tail remains, but we're out of segments */
- return 0;
+ return -EIO;
seg[n].mr_page = NULL;
seg[n].mr_offset = xdrbuf->tail[0].iov_base;
seg[n].mr_len = xdrbuf->tail[0].iov_len;
@@ -164,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* Reply chunk (a counted array):
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
*/
-static unsigned int
+static ssize_t
rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
{
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
- int nsegs, nchunks = 0;
+ int n, nsegs, nchunks = 0;
unsigned int pos;
struct rpcrdma_mr_seg *seg = req->rl_segments;
struct rpcrdma_read_chunk *cur_rchunk = NULL;
@@ -198,12 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
pos = target->head[0].iov_len;
nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
- if (nsegs == 0)
- return 0;
+ if (nsegs < 0)
+ return nsegs;
do {
- /* bind/register the memory, then build chunk from result. */
- int n = rpcrdma_register_external(seg, nsegs,
+ n = rpcrdma_register_external(seg, nsegs,
cur_wchunk != NULL, r_xprt);
if (n <= 0)
goto out;
@@ -248,10 +255,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
/* success. all failures return above */
req->rl_nchunks = nchunks;
- BUG_ON(nchunks == 0);
- BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
- && (nchunks > 3));
-
/*
* finish off header. If write, marshal discrim and nchunks.
*/
@@ -278,8 +281,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
out:
for (pos = 0; nchunks--;)
pos += rpcrdma_deregister_external(
- &req->rl_segments[pos], r_xprt, NULL);
- return 0;
+ &req->rl_segments[pos], r_xprt);
+ return n;
}
/*
@@ -361,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
* [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
* [2] -- optional padding.
* [3] -- if padded, header only in [1] and data here.
+ *
+ * Returns zero on success, otherwise a negative errno.
*/
int
@@ -370,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
char *base;
- size_t hdrlen, rpclen, padlen;
+ size_t rpclen, padlen;
+ ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
@@ -441,14 +447,10 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
/* The following simplification is not true forever */
if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
wtype = rpcrdma_noch;
- BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
-
- if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
- (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
- /* forced to "pure inline"? */
- dprintk("RPC: %s: too much data (%d/%d) for inline\n",
- __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
- return -1;
+ if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+ dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
+ __func__);
+ return -EIO;
}
hdrlen = 28; /*sizeof *headerp;*/
@@ -474,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- BUG_ON(wtype != rpcrdma_noch);
-
+ if (wtype != rpcrdma_noch) {
+ dprintk("RPC: %s: invalid chunk list\n",
+ __func__);
+ return -EIO;
+ }
} else {
headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
@@ -492,8 +497,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* on receive. Therefore, we request a reply chunk
* for non-writes wherever feasible and efficient.
*/
- if (wtype == rpcrdma_noch &&
- r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
+ if (wtype == rpcrdma_noch)
wtype = rpcrdma_replych;
}
}
@@ -511,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
hdrlen = rpcrdma_create_chunks(rqst,
&rqst->rq_rcv_buf, headerp, wtype);
}
-
- if (hdrlen == 0)
- return -1;
+ if (hdrlen < 0)
+ return hdrlen;
dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
@@ -680,15 +683,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
rqst->rq_private_buf = rqst->rq_rcv_buf;
}
-/*
- * This function is called when an async event is posted to
- * the connection which changes the connection state. All it
- * does at this point is mark the connection up/down, the rpc
- * timers do the rest.
- */
void
-rpcrdma_conn_func(struct rpcrdma_ep *ep)
+rpcrdma_connect_worker(struct work_struct *work)
{
+ struct rpcrdma_ep *ep =
+ container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
struct rpc_xprt *xprt = ep->rep_xprt;
spin_lock_bh(&xprt->transport_lock);
@@ -705,13 +704,15 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
}
/*
- * This function is called when memory window unbind which we are waiting
- * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
*/
-static void
-rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
{
- wake_up(&rep->rr_unbind);
+ schedule_delayed_work(&ep->rep_connect_worker, 0);
}
/*
@@ -728,7 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpc_xprt *xprt = rep->rr_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
__be32 *iptr;
- int i, rdmalen, status;
+ int rdmalen, status;
+ unsigned long cwnd;
/* Check status. If bad, signal disconnect and return rep to pool */
if (rep->rr_len == ~0U) {
@@ -783,6 +785,7 @@ repost:
/* from here on, the reply is no longer an orphan */
req->rl_reply = rep;
+ xprt->reestablish_timeout = 0;
/* check for expected message types */
/* The order of some of these tests is important. */
@@ -857,26 +860,10 @@ badheader:
break;
}
- /* If using mw bind, start the deregister process now. */
- /* (Note: if mr_free(), cannot perform it here, in tasklet context) */
- if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
- case RPCRDMA_MEMWINDOWS:
- for (i = 0; req->rl_nchunks-- > 1;)
- i += rpcrdma_deregister_external(
- &req->rl_segments[i], r_xprt, NULL);
- /* Optionally wait (not here) for unbinds to complete */
- rep->rr_func = rpcrdma_unbind_func;
- (void) rpcrdma_deregister_external(&req->rl_segments[i],
- r_xprt, rep);
- break;
- case RPCRDMA_MEMWINDOWS_ASYNC:
- for (i = 0; req->rl_nchunks--;)
- i += rpcrdma_deregister_external(&req->rl_segments[i],
- r_xprt, NULL);
- break;
- default:
- break;
- }
+ cwnd = xprt->cwnd;
+ xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
+ if (xprt->cwnd > cwnd)
+ xprt_release_rqst_cong(rqst->rq_task);
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 8d904e4eef15..8f92a61ee2df 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
/* Set up the XDR head */
rqstp->rq_arg.head[0].iov_base = page_address(page);
- rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
+ rqstp->rq_arg.head[0].iov_len =
+ min_t(size_t, byte_count, ctxt->sge[0].length);
rqstp->rq_arg.len = byte_count;
rqstp->rq_arg.buflen = byte_count;
@@ -85,7 +87,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
page = ctxt->pages[sge_no];
put_page(rqstp->rq_pages[sge_no]);
rqstp->rq_pages[sge_no] = page;
- bc -= min(bc, ctxt->sge[sge_no].length);
+ bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
sge_no++;
}
@@ -113,291 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
rqstp->rq_arg.tail[0].iov_len = 0;
}
-/* Encode a read-chunk-list as an array of IB SGE
- *
- * Assumptions:
- * - chunk[0]->position points to pages[0] at an offset of 0
- * - pages[] is not physically or virtually contiguous and consists of
- * PAGE_SIZE elements.
- *
- * Output:
- * - sge array pointing into pages[] array.
- * - chunk_sge array specifying sge index and count for each
- * chunk in the read list
- *
- */
-static int map_read_chunks(struct svcxprt_rdma *xprt,
- struct svc_rqst *rqstp,
- struct svc_rdma_op_ctxt *head,
- struct rpcrdma_msg *rmsgp,
- struct svc_rdma_req_map *rpl_map,
- s