From 9c760d1fd513053410e10d5f507daad56aeec7db Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 31 Jul 2017 18:38:50 -0400
Subject: NFSv4: Refactor _nfs4_proc_exchange_id()

Tease apart the functionality in nfs4_exchange_id_done() so that
it is easier to debug exchange id vs trunking issues by moving
all the processing out of nfs4_exchange_id_done() and into the
callers.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 171 +++++++++++++++++++++++++++---------------------------
 1 file changed, 85 insertions(+), 86 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ffd2e712595d..34190c2c0900 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7403,72 +7403,8 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 struct nfs41_exchange_id_data {
 	struct nfs41_exchange_id_res res;
 	struct nfs41_exchange_id_args args;
-	struct rpc_xprt *xprt;
-	int rpc_status;
 };
 
-static void nfs4_exchange_id_done(struct rpc_task *task, void *data)
-{
-	struct nfs41_exchange_id_data *cdata =
-					(struct nfs41_exchange_id_data *)data;
-	struct nfs_client *clp = cdata->args.client;
-	int status = task->tk_status;
-
-	trace_nfs4_exchange_id(clp, status);
-
-	if (status == 0)
-		status = nfs4_check_cl_exchange_flags(cdata->res.flags);
-
-	if (cdata->xprt && status == 0) {
-		status = nfs4_detect_session_trunking(clp, &cdata->res,
-						      cdata->xprt);
-		goto out;
-	}
-
-	if (status  == 0)
-		status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect);
-
-	if (status == 0) {
-		clp->cl_clientid = cdata->res.clientid;
-		clp->cl_exchange_flags = cdata->res.flags;
-		clp->cl_seqid = cdata->res.seqid;
-		/* Client ID is not confirmed */
-		if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R))
-			clear_bit(NFS4_SESSION_ESTABLISHED,
-				  &clp->cl_session->session_state);
-
-		kfree(clp->cl_serverowner);
-		clp->cl_serverowner = cdata->res.server_owner;
-		cdata->res.server_owner = NULL;
-
-		/* use the most recent implementation id */
-		kfree(clp->cl_implid);
-		clp->cl_implid = cdata->res.impl_id;
-		cdata->res.impl_id = NULL;
-
-		if (clp->cl_serverscope != NULL &&
-		    !nfs41_same_server_scope(clp->cl_serverscope,
-					cdata->res.server_scope)) {
-			dprintk("%s: server_scope mismatch detected\n",
-				__func__);
-			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
-			kfree(clp->cl_serverscope);
-			clp->cl_serverscope = NULL;
-		}
-
-		if (clp->cl_serverscope == NULL) {
-			clp->cl_serverscope = cdata->res.server_scope;
-			cdata->res.server_scope = NULL;
-		}
-		/* Save the EXCHANGE_ID verifier session trunk tests */
-		memcpy(clp->cl_confirm.data, cdata->args.verifier.data,
-		       sizeof(clp->cl_confirm.data));
-	}
-out:
-	cdata->rpc_status = status;
-	return;
-}
-
 static void nfs4_exchange_id_release(void *data)
 {
 	struct nfs41_exchange_id_data *cdata =
@@ -7482,7 +7418,6 @@ static void nfs4_exchange_id_release(void *data)
 }
 
 static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
-	.rpc_call_done = nfs4_exchange_id_done,
 	.rpc_release = nfs4_exchange_id_release,
 };
 
@@ -7491,7 +7426,8 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
  *
  * Wrapper for EXCHANGE_ID operation.
  */
-static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+static struct rpc_task *
+nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 			u32 sp4_how, struct rpc_xprt *xprt)
 {
 	struct rpc_message msg = {
@@ -7505,17 +7441,15 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 		.flags = RPC_TASK_TIMEOUT,
 	};
 	struct nfs41_exchange_id_data *calldata;
-	struct rpc_task *task;
 	int status;
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
-		return -EIO;
+		return ERR_PTR(-EIO);
 
+	status = -ENOMEM;
 	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
-	if (!calldata) {
-		nfs_put_client(clp);
-		return -ENOMEM;
-	}
+	if (!calldata)
+		goto out;
 
 	nfs4_init_boot_verifier(clp, &calldata->args.verifier);
 
@@ -7554,7 +7488,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 		goto out_impl_id;
 	}
 	if (xprt) {
-		calldata->xprt = xprt;
 		task_setup_data.rpc_xprt = xprt;
 		task_setup_data.flags |= RPC_TASK_SOFTCONN;
 		memcpy(calldata->args.verifier.data, clp->cl_confirm.data,
@@ -7573,15 +7506,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	msg.rpc_resp = &calldata->res;
 	task_setup_data.callback_data = calldata;
 
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
-
-	status = calldata->rpc_status;
-
-	rpc_put_task(task);
-out:
-	return status;
+	return rpc_run_task(&task_setup_data);
 
 out_impl_id:
 	kfree(calldata->res.impl_id);
@@ -7591,8 +7516,69 @@ out_server_owner:
 	kfree(calldata->res.server_owner);
 out_calldata:
 	kfree(calldata);
+out:
 	nfs_put_client(clp);
-	goto out;
+	return ERR_PTR(status);
+}
+
+/*
+ * _nfs4_proc_exchange_id()
+ *
+ * Wrapper for EXCHANGE_ID operation.
+ */
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+			u32 sp4_how)
+{
+	struct rpc_task *task;
+	struct nfs41_exchange_id_args *argp;
+	struct nfs41_exchange_id_res *resp;
+	int status;
+
+	task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	argp = task->tk_msg.rpc_argp;
+	resp = task->tk_msg.rpc_resp;
+	status = task->tk_status;
+	if (status  != 0)
+		goto out;
+
+	status = nfs4_check_cl_exchange_flags(resp->flags);
+	if (status  != 0)
+		goto out;
+
+	status = nfs4_sp4_select_mode(clp, &resp->state_protect);
+	if (status != 0)
+		goto out;
+
+	clp->cl_clientid = resp->clientid;
+	clp->cl_exchange_flags = resp->flags;
+	clp->cl_seqid = resp->seqid;
+	/* Client ID is not confirmed */
+	if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R))
+		clear_bit(NFS4_SESSION_ESTABLISHED,
+			  &clp->cl_session->session_state);
+
+	if (clp->cl_serverscope != NULL &&
+	    !nfs41_same_server_scope(clp->cl_serverscope,
+				resp->server_scope)) {
+		dprintk("%s: server_scope mismatch detected\n",
+			__func__);
+		set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+	}
+
+	swap(clp->cl_serverowner, resp->server_owner);
+	swap(clp->cl_serverscope, resp->server_scope);
+	swap(clp->cl_implid, resp->impl_id);
+
+	/* Save the EXCHANGE_ID verifier session trunk tests */
+	memcpy(clp->cl_confirm.data, argp->verifier.data,
+	       sizeof(clp->cl_confirm.data));
+out:
+	trace_nfs4_exchange_id(clp, status);
+	rpc_put_task(task);
+	return status;
 }
 
 /*
@@ -7615,13 +7601,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	/* try SP4_MACH_CRED if krb5i/p	*/
 	if (authflavor == RPC_AUTH_GSS_KRB5I ||
 	    authflavor == RPC_AUTH_GSS_KRB5P) {
-		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL);
+		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
 		if (!status)
 			return 0;
 	}
 
 	/* try SP4_NONE */
-	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL);
+	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
 }
 
 /**
@@ -7643,6 +7629,9 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
 			    void *data)
 {
 	struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+	struct rpc_task *task;
+	int status;
+
 	u32 sp4_how;
 
 	dprintk("--> %s try %s\n", __func__,
@@ -7651,7 +7640,17 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
 	sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
 
 	/* Test connection for session trunking. Async exchange_id call */
-	return  _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
+	task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	status = task->tk_status;
+	if (status == 0)
+		status = nfs4_detect_session_trunking(adata->clp,
+				task->tk_msg.rpc_resp, xprt);
+
+	rpc_put_task(task);
+	return status;
 }
 EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
 
-- 
cgit v1.2.3


From 937e3133cd0b99b0724aebd081fed372441c9915 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 07:32:50 -0400
Subject: NFSv4.1: Ensure we clear the SP4_MACH_CRED flags in
 nfs4_sp4_select_mode()

If the server changes, so that it no longer supports SP4_MACH_CRED, or
that it doesn't support the same set of SP4_MACH_CRED functionality,
then we want to ensure that we clear the unsupported flags.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 34190c2c0900..d9fc34dedcf8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7319,7 +7319,9 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		      1 << (OP_DESTROY_SESSION - 32) |
 		      1 << (OP_DESTROY_CLIENTID - 32)
 	};
+	unsigned long flags = 0;
 	unsigned int i;
+	int ret = 0;
 
 	if (sp->how == SP4_MACH_CRED) {
 		/* Print state protect result */
@@ -7335,7 +7337,8 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) {
 			if (sp->enforce.u.words[i] & ~supported_enforce[i]) {
 				dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
-				return -EINVAL;
+				ret = -EINVAL;
+				goto out;
 			}
 		}
 
@@ -7354,10 +7357,11 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		    test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) {
 			dfprintk(MOUNT, "sp4_mach_cred:\n");
 			dfprintk(MOUNT, "  minimal mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags);
 		} else {
 			dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
-			return -EINVAL;
+			ret = -EINVAL;
+			goto out;
 		}
 
 		if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
@@ -7365,38 +7369,38 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		    test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
 		    test_bit(OP_LOCKU, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  cleanup mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags);
 		}
 
 		if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  pnfs cleanup mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
-				&clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags);
 		}
 
 		if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
 		    test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  secinfo mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags);
 		}
 
 		if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) &&
 		    test_bit(OP_FREE_STATEID, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  stateid mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_STATEID, &flags);
 		}
 
 		if (test_bit(OP_WRITE, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  write mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_WRITE, &flags);
 		}
 
 		if (test_bit(OP_COMMIT, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  commit mode enabled\n");
-			set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags);
+			__set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags);
 		}
 	}
-
+out:
+	clp->cl_sp4_flags = flags;
 	return 0;
 }
 
-- 
cgit v1.2.3


From bfab281721edc250ac44b8b6787de32c95f9adfc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 08:17:34 -0400
Subject: NFSv4: Cleanup setting of the migration flags.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d9fc34dedcf8..15e91f003adc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7498,13 +7498,10 @@ nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 				sizeof(calldata->args.verifier.data));
 	}
 	calldata->args.client = clp;
-#ifdef CONFIG_NFS_V4_1_MIGRATION
-	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-	EXCHGID4_FLAG_BIND_PRINC_STATEID |
-	EXCHGID4_FLAG_SUPP_MOVED_MIGR,
-#else
 	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-	EXCHGID4_FLAG_BIND_PRINC_STATEID,
+	EXCHGID4_FLAG_BIND_PRINC_STATEID;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+	calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR;
 #endif
 	msg.rpc_argp = &calldata->args;
 	msg.rpc_resp = &calldata->res;
-- 
cgit v1.2.3


From 4e2fcac773902eefdeae81637ca73bbf0398d11f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 8 Aug 2017 09:06:18 -0400
Subject: NFSv4: Use correct inode in _nfs4_opendata_to_nfs4_state()

When doing open by filehandle we don't really want to lookup a new inode,
but rather update the one we've got. Add a helper which does this for us.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d90132642340..5f11caefd36d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1659,6 +1659,28 @@ update:
 	return state;
 }
 
+static struct inode *
+nfs4_opendata_get_inode(struct nfs4_opendata *data)
+{
+	struct inode *inode;
+
+	switch (data->o_arg.claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+		if (!(data->f_attr.valid & NFS_ATTR_FATTR))
+			return ERR_PTR(-EAGAIN);
+		inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh,
+				&data->f_attr, data->f_label);
+		break;
+	default:
+		inode = d_inode(data->dentry);
+		ihold(inode);
+		nfs_refresh_inode(inode, &data->f_attr);
+	}
+	return inode;
+}
+
 static struct nfs4_state *
 _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 {
@@ -1672,10 +1694,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 		goto out;
 	}
 
-	ret = -EAGAIN;
-	if (!(data->f_attr.valid & NFS_ATTR_FATTR))
-		goto err;
-	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
+	inode = nfs4_opendata_get_inode(data);
 	ret = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto err;
@@ -2071,7 +2090,6 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 		data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
 	case NFS4_OPEN_CLAIM_FH:
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
-		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
 	}
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
-- 
cgit v1.2.3


From 75e8c48b9ef30bfda38e7f6e5d807352fbf0b090 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 8 Aug 2017 10:38:07 -0400
Subject: NFSv4: Use the nfs4_state being recovered in
 _nfs4_opendata_to_nfs4_state()

If we're recovering a nfs4_state, then we should try to use that instead
of looking up a new stateid. Only do that if the inodes match, though.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5f11caefd36d..3923ac71a420 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1682,11 +1682,29 @@ nfs4_opendata_get_inode(struct nfs4_opendata *data)
 }
 
 static struct nfs4_state *
-_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data)
 {
+	struct nfs4_state *state;
 	struct inode *inode;
-	struct nfs4_state *state = NULL;
-	int ret;
+
+	inode = nfs4_opendata_get_inode(data);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (data->state != NULL && data->state->inode == inode) {
+		state = data->state;
+		atomic_inc(&state->count);
+	} else
+		state = nfs4_get_open_state(inode, data->owner);
+	iput(inode);
+	if (state == NULL)
+		state = ERR_PTR(-ENOMEM);
+	return state;
+}
+
+static struct nfs4_state *
+_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+	struct nfs4_state *state;
 
 	if (!data->rpc_done) {
 		state = nfs4_try_open_cached(data);
@@ -1694,26 +1712,17 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 		goto out;
 	}
 
-	inode = nfs4_opendata_get_inode(data);
-	ret = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto err;
-	ret = -ENOMEM;
-	state = nfs4_get_open_state(inode, data->owner);
-	if (state == NULL)
-		goto err_put_inode;
+	state = nfs4_opendata_find_nfs4_state(data);
+	if (IS_ERR(state))
+		goto out;
+
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
 			data->o_arg.fmode);
-	iput(inode);
 out:
 	nfs_release_seqid(data->o_arg.seqid);
 	return state;
-err_put_inode:
-	iput(inode);
-err:
-	return ERR_PTR(ret);
 }
 
 static struct nfs4_state *
-- 
cgit v1.2.3


From 6d17d653c9f152e113043d00f3bcf00c0eb5f5a2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 9 Jul 2017 13:45:27 -0400
Subject: NFS: Simplify page writeback

We don't expect the page header lock to ever be held across I/O, so
it should always be safe to wait for it, even if we're doing nonblocking
writebacks.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b1af5dee5e0a..1d447e37f472 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -366,7 +366,6 @@ nfs_page_group_clear_bits(struct nfs_page *req)
  * @inode - inode associated with request page group, must be holding inode lock
  * @head  - head request of page group, must be holding head lock
  * @req   - request that couldn't lock and needs to wait on the req bit lock
- * @nonblock - if true, don't actually wait
  *
  * NOTE: this must be called holding page_group bit lock and inode spin lock
  *       and BOTH will be released before returning.
@@ -375,7 +374,7 @@ nfs_page_group_clear_bits(struct nfs_page *req)
  */
 static int
 nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
-			  struct nfs_page *req, bool nonblock)
+			  struct nfs_page *req)
 	__releases(&inode->i_lock)
 {
 	struct nfs_page *tmp;
@@ -396,10 +395,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 	/* release ref from nfs_page_find_head_request_locked */
 	nfs_release_request(head);
 
-	if (!nonblock)
-		ret = nfs_wait_on_request(req);
-	else
-		ret = -EAGAIN;
+	ret = nfs_wait_on_request(req);
 	nfs_release_request(req);
 
 	return ret;
@@ -464,7 +460,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
  *                              operations for this page.
  *
  * @page - the page used to lookup the "page group" of nfs_page structures
- * @nonblock - if true, don't block waiting for request locks
  *
  * This function joins all sub requests to the head request by first
  * locking all requests in the group, cancelling any pending operations
@@ -478,7 +473,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
  * error was encountered.
  */
 static struct nfs_page *
-nfs_lock_and_join_requests(struct page *page, bool nonblock)
+nfs_lock_and_join_requests(struct page *page)
 {
 	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *head, *subreq;
@@ -511,14 +506,9 @@ try_again:
 	if (ret < 0) {
 		spin_unlock(&inode->i_lock);
 
-		if (!nonblock && ret == -EAGAIN) {
-			nfs_page_group_lock_wait(head);
-			nfs_release_request(head);
-			goto try_again;
-		}
-
+		nfs_page_group_lock_wait(head);
 		nfs_release_request(head);
-		return ERR_PTR(ret);
+		goto try_again;
 	}
 
 	/* lock each request in the page group */
@@ -543,7 +533,7 @@ try_again:
 			/* releases page group bit lock and
 			 * inode spin lock and all references */
 			ret = nfs_unroll_locks_and_wait(inode, head,
-				subreq, nonblock);
+				subreq);
 
 			if (ret == 0)
 				goto try_again;
@@ -624,12 +614,12 @@ nfs_error_is_fatal_on_server(int err)
  * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-				struct page *page, bool nonblock)
+				struct page *page)
 {
 	struct nfs_page *req;
 	int ret = 0;
 
-	req = nfs_lock_and_join_requests(page, nonblock);
+	req = nfs_lock_and_join_requests(page);
 	if (!req)
 		goto out;
 	ret = PTR_ERR(req);
@@ -672,7 +662,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
 	int ret;
 
 	nfs_pageio_cond_complete(pgio, page_index(page));
-	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+	ret = nfs_page_async_flush(pgio, page);
 	if (ret == -EAGAIN) {
 		redirty_page_for_writepage(wbc, page);
 		ret = 0;
@@ -2015,7 +2005,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 
 	/* blocking call to cancel all requests and join to a single (head)
 	 * request */
-	req = nfs_lock_and_join_requests(page, false);
+	req = nfs_lock_and_join_requests(page);
 
 	if (IS_ERR(req)) {
 		ret = PTR_ERR(req);
-- 
cgit v1.2.3


From 82749dd4efcec8e90fa7769eec3dd0afa2e3396a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 09:30:13 -0400
Subject: NFS: Reduce lock contention in nfs_page_find_head_request()

Add a lockless check for whether or not the page might be carrying
an existing writeback before we grab the inode->i_lock.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1d447e37f472..06e150c4e315 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -190,9 +190,11 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page)
 	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *req = NULL;
 
-	spin_lock(&inode->i_lock);
-	req = nfs_page_find_head_request_locked(NFS_I(inode), page);
-	spin_unlock(&inode->i_lock);
+	if (PagePrivate(page) || PageSwapCache(page)) {
+		spin_lock(&inode->i_lock);
+		req = nfs_page_find_head_request_locked(NFS_I(inode), page);
+		spin_unlock(&inode->i_lock);
+	}
 	return req;
 }
 
-- 
cgit v1.2.3


From 1403390d8366c717139cab26b8e94d943915fa12 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 09:37:12 -0400
Subject: NFS: Reduce lock contention in nfs_try_to_update_request()

Micro-optimisation to move the lockless check into the for(;;) loop.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 06e150c4e315..bb019096c331 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1097,13 +1097,12 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 	unsigned int end;
 	int error;
 
-	if (!PagePrivate(page))
-		return NULL;
-
 	end = offset + bytes;
-	spin_lock(&inode->i_lock);
 
 	for (;;) {
+		if (!(PagePrivate(page) || PageSwapCache(page)))
+			return NULL;
+		spin_lock(&inode->i_lock);
 		req = nfs_page_find_head_request_locked(NFS_I(inode), page);
 		if (req == NULL)
 			goto out_unlock;
@@ -1132,7 +1131,6 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		nfs_release_request(req);
 		if (error != 0)
 			goto out_err;
-		spin_lock(&inode->i_lock);
 	}
 
 	/* Okay, the request matches. Update the region */
-- 
cgit v1.2.3


From 08fead2ae5a9953d47677416cc5f6bcae448480d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 18 Jul 2017 19:31:10 -0400
Subject: NFS: Ensure we always dereference the page head last

This fixes a race with nfs_page_group_sync_on_bit() whereby the
call to wake_up_bit() in nfs_page_group_unlock() could occur after
the page header had been freed.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index de9066a92c0d..a6f2bbd709ba 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -306,14 +306,11 @@ static void
 nfs_page_group_destroy(struct kref *kref)
 {
 	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+	struct nfs_page *head = req->wb_head;
 	struct nfs_page *tmp, *next;
 
-	/* subrequests must release the ref on the head request */
-	if (req->wb_head != req)
-		nfs_release_request(req->wb_head);
-
 	if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
-		return;
+		goto out;
 
 	tmp = req;
 	do {
@@ -324,6 +321,10 @@ nfs_page_group_destroy(struct kref *kref)
 		nfs_free_request(tmp);
 		tmp = next;
 	} while (tmp != req);
+out:
+	/* subrequests must release the ref on the head request */
+	if (head != req)
+		nfs_release_request(head);
 }
 
 /**
-- 
cgit v1.2.3


From 7cb9cd9aa2eafe869935d4168031f1ed376d924c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:11:11 -0400
Subject: NFS: Fix a reference and lock leak in nfs_lock_and_join_requests()

Yes, this is a situation that should never happen (hence the WARN_ON)
but we should still ensure that we free up the locks and references to
the faulty pages.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bb019096c331..1ca759719429 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -526,8 +526,7 @@ try_again:
 		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
-			nfs_page_group_unlock(head);
-			spin_unlock(&inode->i_lock);
+			nfs_unroll_locks_and_wait(inode, head, subreq);
 			return ERR_PTR(-EIO);
 		}
 
-- 
cgit v1.2.3


From a0e265bc78010d2d831a968d4cea3c40a0efac8b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:29:32 -0400
Subject: NFS: Fix an ABBA issue in nfs_lock_and_join_requests()

All other callers of nfs_page_group_lock() appear to already hold the
page lock on the head page, so doing it in the opposite order here
is inefficient, although not deadlock prone since we roll back all
locks on contention.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1ca759719429..c940e615f5dc 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -383,7 +383,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 	int ret;
 
 	/* relinquish all the locks successfully grabbed this run */
-	for (tmp = head ; tmp != req; tmp = tmp->wb_this_page)
+	for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page)
 		nfs_unlock_request(tmp);
 
 	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
@@ -395,7 +395,7 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 	spin_unlock(&inode->i_lock);
 
 	/* release ref from nfs_page_find_head_request_locked */
-	nfs_release_request(head);
+	nfs_unlock_and_release_request(head);
 
 	ret = nfs_wait_on_request(req);
 	nfs_release_request(req);
@@ -484,10 +484,6 @@ nfs_lock_and_join_requests(struct page *page)
 	int ret;
 
 try_again:
-	total_bytes = 0;
-
-	WARN_ON_ONCE(destroy_list);
-
 	spin_lock(&inode->i_lock);
 
 	/*
@@ -502,6 +498,16 @@ try_again:
 		return NULL;
 	}
 
+	/* lock the page head first in order to avoid an ABBA inefficiency */
+	if (!nfs_lock_request(head)) {
+		spin_unlock(&inode->i_lock);
+		ret = nfs_wait_on_request(head);
+		nfs_release_request(head);
+		if (ret < 0)
+			return ERR_PTR(ret);
+		goto try_again;
+	}
+
 	/* holding inode lock, so always make a non-blocking call to try the
 	 * page group lock */
 	ret = nfs_page_group_lock(head, true);
@@ -509,13 +515,14 @@ try_again:
 		spin_unlock(&inode->i_lock);
 
 		nfs_page_group_lock_wait(head);
-		nfs_release_request(head);
+		nfs_unlock_and_release_request(head);
 		goto try_again;
 	}
 
 	/* lock each request in the page group */
-	subreq = head;
-	do {
+	total_bytes = head->wb_bytes;
+	for (subreq = head->wb_this_page; subreq != head;
+			subreq = subreq->wb_this_page) {
 		/*
 		 * Subrequests are always contiguous, non overlapping
 		 * and in order - but may be repeated (mirrored writes).
@@ -541,9 +548,7 @@ try_again:
 
 			return ERR_PTR(ret);
 		}
-
-		subreq = subreq->wb_this_page;
-	} while (subreq != head);
+	}
 
 	/* Now that all requests are locked, make sure they aren't on any list.
 	 * Commit list removal accounting is done after locks are dropped */
-- 
cgit v1.2.3


From e14bebf6de11a4b8476cf2b0a75bf7c3e69112d5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 11:11:49 -0400
Subject: NFS: Don't check request offset and size without holding a lock

Request offsets and sizes are not guaranteed to be stable unless you
are holding the request locked.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c940e615f5dc..84b6818e5278 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -523,6 +523,17 @@ try_again:
 	total_bytes = head->wb_bytes;
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
+		if (!nfs_lock_request(subreq)) {
+			/* releases page group bit lock and
+			 * inode spin lock and all references */
+			ret = nfs_unroll_locks_and_wait(inode, head,
+				subreq);
+
+			if (ret == 0)
+				goto try_again;
+
+			return ERR_PTR(ret);
+		}
 		/*
 		 * Subrequests are always contiguous, non overlapping
 		 * and in order - but may be repeated (mirrored writes).
@@ -533,21 +544,10 @@ try_again:
 		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
+			nfs_unlock_request(subreq);
 			nfs_unroll_locks_and_wait(inode, head, subreq);
 			return ERR_PTR(-EIO);
 		}
-
-		if (!nfs_lock_request(subreq)) {
-			/* releases page group bit lock and
-			 * inode spin lock and all references */
-			ret = nfs_unroll_locks_and_wait(inode, head,
-				subreq);
-
-			if (ret == 0)
-				goto try_again;
-
-			return ERR_PTR(ret);
-		}
 	}
 
 	/* Now that all requests are locked, make sure they aren't on any list.
-- 
cgit v1.2.3


From 31a01f093edbc687e783a4c8adcd76d3cc91a559 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 18 Jul 2017 19:18:49 -0400
Subject: NFS: Don't unlock writebacks before declaring PG_WB_END

We don't want nfs_lock_and_join_requests() to start fiddling with
the request before the call to nfs_page_group_sync_on_bit().

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 84b6818e5278..bb38c881fc48 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -335,8 +335,11 @@ static void nfs_end_page_writeback(struct nfs_page *req)
 {
 	struct inode *inode = page_file_mapping(req->wb_page)->host;
 	struct nfs_server *nfss = NFS_SERVER(inode);
+	bool is_done;
 
-	if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
+	is_done = nfs_page_group_sync_on_bit(req, PG_WB_END);
+	nfs_unlock_request(req);
+	if (!is_done)
 		return;
 
 	end_page_writeback(req->wb_page);
@@ -596,7 +599,6 @@ try_again:
 
 static void nfs_write_error_remove_page(struct nfs_page *req)
 {
-	nfs_unlock_request(req);
 	nfs_end_page_writeback(req);
 	generic_error_remove_page(page_file_mapping(req->wb_page),
 				  req->wb_page);
@@ -1019,7 +1021,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 remove_req:
 		nfs_inode_remove_request(req);
 next:
-		nfs_unlock_request(req);
 		nfs_end_page_writeback(req);
 		nfs_release_request(req);
 	}
@@ -1406,7 +1407,6 @@ static void nfs_redirty_request(struct nfs_page *req)
 {
 	nfs_mark_request_dirty(req);
 	set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
-	nfs_unlock_request(req);
 	nfs_end_page_writeback(req);
 	nfs_release_request(req);
 }
-- 
cgit v1.2.3


From b66aaa8dfeda7b5c7df513cf3b36e1290fa84055 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 18 Jul 2017 15:22:12 -0400
Subject: NFS: Fix the inode request accounting when pages have subrequests

Both nfs_destroy_unlinked_subrequests() and nfs_lock_and_join_requests()
manipulate the inode flags adjusting the NFS_I(inode)->nrequests.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bb38c881fc48..ee981353d4aa 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -418,7 +418,8 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
  */
 static void
 nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
-				 struct nfs_page *old_head)
+				 struct nfs_page *old_head,
+				 struct inode *inode)
 {
 	while (destroy_list) {
 		struct nfs_page *subreq = destroy_list;
@@ -443,9 +444,12 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 			nfs_page_group_clear_bits(subreq);
 
 			/* release the PG_INODE_REF reference */
-			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags))
+			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
 				nfs_release_request(subreq);
-			else
+				spin_lock(&inode->i_lock);
+				NFS_I(inode)->nrequests--;
+				spin_unlock(&inode->i_lock);
+			} else
 				WARN_ON_ONCE(1);
 		} else {
 			WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags));
@@ -572,25 +576,24 @@ try_again:
 		head->wb_bytes = total_bytes;
 	}
 
+	/* Postpone destruction of this request */
+	if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
+		set_bit(PG_INODE_REF, &head->wb_flags);
+		kref_get(&head->wb_kref);
+		NFS_I(inode)->nrequests++;
+	}
+
 	/*
 	 * prepare head request to be added to new pgio descriptor
 	 */
 	nfs_page_group_clear_bits(head);
 
-	/*
-	 * some part of the group was still on the inode list - otherwise
-	 * the group wouldn't be involved in async write.
-	 * grab a reference for the head request, iff it needs one.
-	 */
-	if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags))
-		kref_get(&head->wb_kref);
-
 	nfs_page_group_unlock(head);
 
 	/* drop lock to clean uprequests on destroy list */
 	spin_unlock(&inode->i_lock);
 
-	nfs_destroy_unlinked_subrequests(destroy_list, head);
+	nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
 
 	/* still holds ref on head from nfs_page_find_head_request_locked
 	 * and still has lock on head from lock loop */
-- 
cgit v1.2.3


From f6032f216fca8a1fa7f43a652f26cdf633183745 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 19:50:23 -0400
Subject: NFS: Teach nfs_try_to_update_request() to deal with request
 page_groups

Simplify the code, and avoid some flushes to disk.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 60 ++++++++++++++++++++--------------------------------------
 1 file changed, 20 insertions(+), 40 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ee981353d4aa..0b4d1ef168e0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1107,39 +1107,19 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 
 	end = offset + bytes;
 
-	for (;;) {
-		if (!(PagePrivate(page) || PageSwapCache(page)))
-			return NULL;
-		spin_lock(&inode->i_lock);
-		req = nfs_page_find_head_request_locked(NFS_I(inode), page);
-		if (req == NULL)
-			goto out_unlock;
-
-		/* should be handled by nfs_flush_incompatible */
-		WARN_ON_ONCE(req->wb_head != req);
-		WARN_ON_ONCE(req->wb_this_page != req);
-
-		rqend = req->wb_offset + req->wb_bytes;
-		/*
-		 * Tell the caller to flush out the request if
-		 * the offsets are non-contiguous.
-		 * Note: nfs_flush_incompatible() will already
-		 * have flushed out requests having wrong owners.
-		 */
-		if (offset > rqend
-		    || end < req->wb_offset)
-			goto out_flushme;
-
-		if (nfs_lock_request(req))
-			break;
+	req = nfs_lock_and_join_requests(page);
+	if (IS_ERR_OR_NULL(req))
+		return req;
 
-		/* The request is locked, so wait and then retry */
-		spin_unlock(&inode->i_lock);
-		error = nfs_wait_on_request(req);
-		nfs_release_request(req);
-		if (error != 0)
-			goto out_err;
-	}
+	rqend = req->wb_offset + req->wb_bytes;
+	/*
+	 * Tell the caller to flush out the request if
+	 * the offsets are non-contiguous.
+	 * Note: nfs_flush_incompatible() will already
+	 * have flushed out requests having wrong owners.
+	 */
+	if (offset > rqend || end < req->wb_offset)
+		goto out_flushme;
 
 	/* Okay, the request matches. Update the region */
 	if (offset < req->wb_offset) {
@@ -1150,17 +1130,17 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		req->wb_bytes = end - req->wb_offset;
 	else
 		req->wb_bytes = rqend - req->wb_offset;
-out_unlock:
-	if (req)
-		nfs_clear_request_commit(req);
-	spin_unlock(&inode->i_lock);
 	return req;
 out_flushme:
-	spin_unlock(&inode->i_lock);
-	nfs_release_request(req);
+	/*
+	 * Note: we mark the request dirty here because
+	 * nfs_lock_and_join_requests() cannot preserve
+	 * commit flags, so we have to replay the write.
+	 */
+	nfs_mark_request_dirty(req);
+	nfs_unlock_and_release_request(req);
 	error = nfs_wb_page(inode, page);
-out_err:
-	return ERR_PTR(error);
+	return (error < 0) ? ERR_PTR(error) : NULL;
 }
 
 /*
-- 
cgit v1.2.3


From 7e6cca6caf7230b049bd681c5400b01c365ee452 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 20:00:46 -0400
Subject: NFS: Remove page group limit in nfs_flush_incompatible()

nfs_try_to_update_request() should be able to cope now.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0b4d1ef168e0..08c1ce968cce 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1205,8 +1205,6 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 		l_ctx = req->wb_lock_context;
 		do_flush = req->wb_page != page ||
 			!nfs_match_open_context(req->wb_context, ctx);
-		/* for now, flush if more than 1 request in page_group */
-		do_flush |= req->wb_this_page != req;
 		if (l_ctx && flctx &&
 		    !(list_empty_careful(&flctx->flc_posix) &&
 		      list_empty_careful(&flctx->flc_flock))) {
-- 
cgit v1.2.3


From b5bab9bf91324a7fe21b365d6966cfd087d08e3a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:34:21 -0400
Subject: NFS: Reduce inode->i_lock contention in nfs_lock_and_join_requests()

We should no longer need the inode->i_lock, now that we've
straightened out the request locking. The locking schema is now:

1) Lock page head request
2) Lock the page group
3) Lock the subrequests one by one

Note that there is a subtle race with nfs_inode_remove_request() due
to the fact that the latter does not lock the page head, when removing
it from the struct page. Only the last subrequest is locked, hence
we need to re-check that the PagePrivate(page) is still set after
we've locked all the subrequests.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 08c1ce968cce..ff7c90c7ff79 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -372,15 +372,14 @@ nfs_page_group_clear_bits(struct nfs_page *req)
  * @head  - head request of page group, must be holding head lock
  * @req   - request that couldn't lock and needs to wait on the req bit lock
  *
- * NOTE: this must be called holding page_group bit lock and inode spin lock
- *       and BOTH will be released before returning.
+ * NOTE: this must be called holding page_group bit lock
+ *       which will be released before returning.
  *
  * returns 0 on success, < 0 on error.
  */
 static int
 nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 			  struct nfs_page *req)
-	__releases(&inode->i_lock)
 {
 	struct nfs_page *tmp;
 	int ret;
@@ -395,7 +394,6 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
 	kref_get(&req->wb_kref);
 
 	nfs_page_group_unlock(head);
-	spin_unlock(&inode->i_lock);
 
 	/* release ref from nfs_page_find_head_request_locked */
 	nfs_unlock_and_release_request(head);
@@ -491,8 +489,9 @@ nfs_lock_and_join_requests(struct page *page)
 	int ret;
 
 try_again:
+	if (!(PagePrivate(page) || PageSwapCache(page)))
+		return NULL;
 	spin_lock(&inode->i_lock);
-
 	/*
 	 * A reference is taken only on the head request which acts as a
 	 * reference to the whole page group - the group will not be destroyed
@@ -514,16 +513,12 @@ try_again:
 			return ERR_PTR(ret);
 		goto try_again;
 	}
+	spin_unlock(&inode->i_lock);
 
-	/* holding inode lock, so always make a non-blocking call to try the
-	 * page group lock */
-	ret = nfs_page_group_lock(head, true);
+	ret = nfs_page_group_lock(head, false);
 	if (ret < 0) {
-		spin_unlock(&inode->i_lock);
-
-		nfs_page_group_lock_wait(head);
 		nfs_unlock_and_release_request(head);
-		goto try_again;
+		return ERR_PTR(ret);
 	}
 
 	/* lock each request in the page group */
@@ -531,8 +526,10 @@ try_again:
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
 		if (!nfs_lock_request(subreq)) {
-			/* releases page group bit lock and
-			 * inode spin lock and all references */
+			/*
+			 * releases page group bit lock and
+			 * page locks and all references
+			 */
 			ret = nfs_unroll_locks_and_wait(inode, head,
 				subreq);
 
@@ -580,7 +577,9 @@ try_again:
 	if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
 		set_bit(PG_INODE_REF, &head->wb_flags);
 		kref_get(&head->wb_kref);
+		spin_lock(&inode->i_lock);
 		NFS_I(inode)->nrequests++;
+		spin_unlock(&inode->i_lock);
 	}
 
 	/*
@@ -590,11 +589,14 @@ try_again:
 
 	nfs_page_group_unlock(head);
 
-	/* drop lock to clean uprequests on destroy list */
-	spin_unlock(&inode->i_lock);
-
 	nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
 
+	/* Did we lose a race with nfs_inode_remove_request()? */
+	if (!(PagePrivate(page) || PageSwapCache(page))) {
+		nfs_unlock_and_release_request(head);
+		return NULL;
+	}
+
 	/* still holds ref on head from nfs_page_find_head_request_locked
 	 * and still has lock on head from lock loop */
 	return head;
@@ -968,7 +970,7 @@ nfs_clear_page_commit(struct page *page)
 		    WB_RECLAIMABLE);
 }
 
-/* Called holding inode (/cinfo) lock */
+/* Called holding the request lock on @req */
 static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
@@ -977,9 +979,11 @@ nfs_clear_request_commit(struct nfs_page *req)
 		struct nfs_commit_info cinfo;
 
 		nfs_init_cinfo_from_inode(&cinfo, inode);
+		spin_lock(&inode->i_lock);
 		if (!pnfs_clear_request_commit(req, &cinfo)) {
 			nfs_request_remove_commit_list(req, &cinfo);
 		}
+		spin_unlock(&inode->i_lock);
 		nfs_clear_page_commit(req->wb_page);
 	}
 }
-- 
cgit v1.2.3


From 74a6d4b5ae4ec7e93c72a92decb2f8c16c812416 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Jul 2017 08:23:10 -0400
Subject: NFS: Further optimise nfs_lock_and_join_requests()

When locking the entire group in order to remove subrequests,
the locks are always taken in order, and with the page group
lock being taken after the page head is locked. The intention
is that:

1) The lock on the group head guarantees that requests may not
   be removed from the group (although new entries could be appended
   if we're not holding the group lock).
2) It is safe to drop and retake the page group lock while iterating
   through the list, in particular when waiting for a subrequest lock.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 45 ++++++++++++++++++---------------------------
 1 file changed, 18 insertions(+), 27 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ff7c90c7ff79..1ee5d89380d9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -377,31 +377,17 @@ nfs_page_group_clear_bits(struct nfs_page *req)
  *
  * returns 0 on success, < 0 on error.
  */
-static int
-nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
+static void
+nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
 			  struct nfs_page *req)
 {
 	struct nfs_page *tmp;
-	int ret;
 
 	/* relinquish all the locks successfully grabbed this run */
 	for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page)
 		nfs_unlock_request(tmp);
 
 	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
-
-	/* grab a ref on the request that will be waited on */
-	kref_get(&req->wb_kref);
-
-	nfs_page_group_unlock(head);
-
-	/* release ref from nfs_page_find_head_request_locked */
-	nfs_unlock_and_release_request(head);
-
-	ret = nfs_wait_on_request(req);
-	nfs_release_request(req);
-
-	return ret;
 }
 
 /*
@@ -525,18 +511,21 @@ try_again:
 	total_bytes = head->wb_bytes;
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
-		if (!nfs_lock_request(subreq)) {
+
+		while (!nfs_lock_request(subreq)) {
 			/*
-			 * releases page group bit lock and
-			 * page locks and all references
+			 * Unlock page to allow nfs_page_group_sync_on_bit()
+			 * to succeed
 			 */
-			ret = nfs_unroll_locks_and_wait(inode, head,
-				subreq);
-
-			if (ret == 0)
-				goto try_again;
-
-			return ERR_PTR(ret);
+			nfs_page_group_unlock(head);
+			ret = nfs_wait_on_request(subreq);
+			if (!ret)
+				ret = nfs_page_group_lock(head, false);
+			if (ret < 0) {
+				nfs_unroll_locks(inode, head, subreq);
+				nfs_unlock_and_release_request(head);
+				return ERR_PTR(ret);
+			}
 		}
 		/*
 		 * Subrequests are always contiguous, non overlapping
@@ -549,7 +538,9 @@ try_again:
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
 			nfs_unlock_request(subreq);
-			nfs_unroll_locks_and_wait(inode, head, subreq);
+			nfs_unroll_locks(inode, head, subreq);
+			nfs_page_group_unlock(head);
+			nfs_unlock_and_release_request(head);
 			return ERR_PTR(-EIO);
 		}
 	}
-- 
cgit v1.2.3


From 5b2b5187fa85665f0c47029ecaf49186ec138d9b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Jul 2017 10:06:36 -0400
Subject: NFS: Fix nfs_page_group_destroy() and nfs_lock_and_join_requests()
 race cases

Since nfs_page_group_destroy() does not take any locks on the requests
to be freed, we need to ensure that we don't inadvertently free the
request in nfs_destroy_unlinked_subrequests() while the last reference
is being released elsewhere.

Do this by:

1) Taking a reference to the request unless it is already being freed
2) Checking (under the page group lock) if PG_TEARDOWN is already set before
   freeing an unreferenced request in nfs_destroy_unlinked_subrequests()

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 58 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1ee5d89380d9..ffb9934607ef 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -384,10 +384,11 @@ nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
 	struct nfs_page *tmp;
 
 	/* relinquish all the locks successfully grabbed this run */
-	for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page)
-		nfs_unlock_request(tmp);
-
-	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+	for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+		if (!kref_read(&tmp->wb_kref))
+			continue;
+		nfs_unlock_and_release_request(tmp);
+	}
 }
 
 /*
@@ -414,36 +415,32 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 		WARN_ON_ONCE(old_head != subreq->wb_head);
 
 		/* make sure old group is not used */
-		subreq->wb_head = subreq;
 		subreq->wb_this_page = subreq;
 
-		/* subreq is now totally disconnected from page group or any
-		 * write / commit lists. last chance to wake any waiters */
-		nfs_unlock_request(subreq);
-
-		if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) {
-			/* release ref on old head request */
-			nfs_release_request(old_head);
+		/* Note: races with nfs_page_group_destroy() */
+		if (!kref_read(&subreq->wb_kref)) {
+			bool freeme = test_bit(PG_TEARDOWN, &subreq->wb_flags);
 
 			nfs_page_group_clear_bits(subreq);
+			/* Check if we raced with nfs_page_group_destroy() */
+			if (freeme)
+				nfs_free_request(subreq);
+			continue;
+		}
 
-			/* release the PG_INODE_REF reference */
-			if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
-				nfs_release_request(subreq);
-				spin_lock(&inode->i_lock);
-				NFS_I(inode)->nrequests--;
-				spin_unlock(&inode->i_lock);
-			} else
-				WARN_ON_ONCE(1);
-		} else {
-			WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags));
-			/* zombie requests have already released the last
-			 * reference and were waiting on the rest of the
-			 * group to complete. Since it's no longer part of a
-			 * group, simply free the request */
-			nfs_page_group_clear_bits(subreq);
-			nfs_free_request(subreq);
+		subreq->wb_head = subreq;
+
+		if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
+			nfs_release_request(subreq);
+			spin_lock(&inode->i_lock);
+			NFS_I(inode)->nrequests--;
+			spin_unlock(&inode->i_lock);
 		}
+
+		nfs_page_group_clear_bits(subreq);
+		/* subreq is now totally disconnected from page group or any
+		 * write / commit lists. last chance to wake any waiters */
+		nfs_unlock_and_release_request(subreq);
 	}
 }
 
@@ -512,6 +509,8 @@ try_again:
 	for (subreq = head->wb_this_page; subreq != head;
 			subreq = subreq->wb_this_page) {
 
+		if (!kref_get_unless_zero(&subreq->wb_kref))
+			continue;
 		while (!nfs_lock_request(subreq)) {
 			/*
 			 * Unlock page to allow nfs_page_group_sync_on_bit()
@@ -523,6 +522,7 @@ try_again:
 				ret = nfs_page_group_lock(head, false);
 			if (ret < 0) {
 				nfs_unroll_locks(inode, head, subreq);
+				nfs_release_request(subreq);
 				nfs_unlock_and_release_request(head);
 				return ERR_PTR(ret);
 			}
@@ -537,8 +537,8 @@ try_again:
 		} else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
 			    ((subreq->wb_offset + subreq->wb_bytes) >
 			     (head->wb_offset + total_bytes)))) {
-			nfs_unlock_request(subreq);
 			nfs_unroll_locks(inode, head, subreq);
+			nfs_unlock_and_release_request(subreq);
 			nfs_page_group_unlock(head);
 			nfs_unlock_and_release_request(head);
 			return ERR_PTR(-EIO);
-- 
cgit v1.2.3


From 902a4c00462a755fe4a6ca655813c8b2a51fab4c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 19 Jul 2017 13:50:07 -0400
Subject: NFS: Remove nfs_page_group_clear_bits()

At this point, we only expect ever to potentially see PG_REMOVE and
PG_TEARDOWN being set on the subrequests.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 29 +++--------------------------
 1 file changed, 3 insertions(+), 26 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ffb9934607ef..20d44ea328b6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -347,22 +347,6 @@ static void nfs_end_page_writeback(struct nfs_page *req)
 		clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 }
 
-
-/* nfs_page_group_clear_bits
- *   @req - an nfs request
- * clears all page group related bits from @req
- */
-static void
-nfs_page_group_clear_bits(struct nfs_page *req)
-{
-	clear_bit(PG_TEARDOWN, &req->wb_flags);
-	clear_bit(PG_UNLOCKPAGE, &req->wb_flags);
-	clear_bit(PG_UPTODATE, &req->wb_flags);
-	clear_bit(PG_WB_END, &req->wb_flags);
-	clear_bit(PG_REMOVE, &req->wb_flags);
-}
-
-
 /*
  * nfs_unroll_locks_and_wait -  unlock all newly locked reqs and wait on @req
  *
@@ -417,13 +401,12 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 		/* make sure old group is not used */
 		subreq->wb_this_page = subreq;
 
+		clear_bit(PG_REMOVE, &subreq->wb_flags);
+
 		/* Note: races with nfs_page_group_destroy() */
 		if (!kref_read(&subreq->wb_kref)) {
-			bool freeme = test_bit(PG_TEARDOWN, &subreq->wb_flags);
-
-			nfs_page_group_clear_bits(subreq);
 			/* Check if we raced with nfs_page_group_destroy() */
-			if (freeme)
+			if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags))
 				nfs_free_request(subreq);
 			continue;
 		}
@@ -437,7 +420,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 			spin_unlock(&inode->i_lock);
 		}
 
-		nfs_page_group_clear_bits(subreq);
 		/* subreq is now totally disconnected from page group or any
 		 * write / commit lists. last chance to wake any waiters */
 		nfs_unlock_and_release_request(subreq);
@@ -573,11 +555,6 @@ try_again:
 		spin_unlock(&inode->i_lock);
 	}
 
-	/*
-	 * prepare head request to be added to new pgio descriptor
-	 */
-	nfs_page_group_clear_bits(head);
-
 	nfs_page_group_unlock(head);
 
 	nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
-- 
cgit v1.2.3


From dee83046e73cb7ebbbae955c1ef0f4f55a0f44f9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:51:02 -0400
Subject: NFS: Remove unuse function nfs_page_group_lock_wait()

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a6f2bbd709ba..ced7974622dd 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -165,27 +165,6 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock)
 	return -EAGAIN;
 }
 
-/*
- * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
- * @req - a request in the group
- *
- * This is a blocking call to wait for the group lock to be cleared.
- */
-void
-nfs_page_group_lock_wait(struct nfs_page *req)
-{
-	struct nfs_page *head = req->wb_head;
-
-	WARN_ON_ONCE(head != head->wb_head);
-
-	if (!test_bit(PG_HEADLOCK, &head->wb_flags))
-		return;
-	set_bit(PG_CONTENDED1, &head->wb_flags);
-	smp_mb__after_atomic();
-	wait_on_bit(&head->wb_flags, PG_HEADLOCK,
-		TASK_UNINTERRUPTIBLE);
-}
-
 /*
  * nfs_page_group_unlock - unlock the head of the page group
  * @req - request in group that is to be unlocked
-- 
cgit v1.2.3


From 1344b7ea172b4911a8ee8a6ff26c5bc6b5abb302 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 10:54:14 -0400
Subject: NFS: Remove unused parameter from nfs_page_group_lock()

nfs_page_group_lock() is now always called with the 'nonblock'
parameter set to 'false'.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c | 31 +++++++++++--------------------
 fs/nfs/write.c    |  6 +++---
 2 files changed, 14 insertions(+), 23 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ced7974622dd..af6731dd4324 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -134,19 +134,14 @@ EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
 /*
  * nfs_page_group_lock - lock the head of the page group
  * @req - request in group that is to be locked
- * @nonblock - if true don't block waiting for lock
  *
- * this lock must be held if modifying the page group list
+ * this lock must be held when traversing or modifying the page
+ * group list
  *
- * return 0 on success, < 0 on error: -EDELAY if nonblocking or the
- * result from wait_on_bit_lock
- *
- * NOTE: calling with nonblock=false should always have set the
- *       lock bit (see fs/buffer.c and other uses of wait_on_bit_lock
- *       with TASK_UNINTERRUPTIBLE), so there is no need to check the result.
+ * return 0 on success, < 0 on error
  */
 int
-nfs_page_group_lock(struct nfs_page *req, bool nonblock)
+nfs_page_group_lock(struct nfs_page *req)
 {
 	struct nfs_page *head = req->wb_head;
 
@@ -155,14 +150,10 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock)
 	if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
 		return 0;
 
-	if (!nonblock) {
-		set_bit(PG_CONTENDED1, &head->wb_flags);
-		smp_mb__after_atomic();
-		return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+	set_bit(PG_CONTENDED1, &head->wb_flags);
+	smp_mb__after_atomic();
+	return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
 				TASK_UNINTERRUPTIBLE);
-	}
-
-	return -EAGAIN;
 }
 
 /*
@@ -225,7 +216,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
 {
 	bool ret;
 
-	nfs_page_group_lock(req, false);
+	nfs_page_group_lock(req);
 	ret = nfs_page_group_sync_on_bit_locked(req, bit);
 	nfs_page_group_unlock(req);
 
@@ -1016,7 +1007,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 	unsigned int bytes_left = 0;
 	unsigned int offset, pgbase;
 
-	nfs_page_group_lock(req, false);
+	nfs_page_group_lock(req);
 
 	subreq = req;
 	bytes_left = subreq->wb_bytes;
@@ -1038,7 +1029,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 			if (mirror->pg_recoalesce)
 				return 0;
 			/* retry add_request for this subreq */
-			nfs_page_group_lock(req, false);
+			nfs_page_group_lock(req);
 			continue;
 		}
 
@@ -1135,7 +1126,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 
 	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
 		if (midx) {
-			nfs_page_group_lock(req, false);
+			nfs_page_group_lock(req);
 
 			/* find the last request */
 			for (lastreq = req->wb_head;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 20d44ea328b6..0f418d825185 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -271,7 +271,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
 	unsigned int pos = 0;
 	unsigned int len = nfs_page_length(req->wb_page);
 
-	nfs_page_group_lock(req, false);
+	nfs_page_group_lock(req);
 
 	do {
 		tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -480,7 +480,7 @@ try_again:
 	}
 	spin_unlock(&inode->i_lock);
 
-	ret = nfs_page_group_lock(head, false);
+	ret = nfs_page_group_lock(head);
 	if (ret < 0) {
 		nfs_unlock_and_release_request(head);
 		return ERR_PTR(ret);
@@ -501,7 +501,7 @@ try_again:
 			nfs_page_group_unlock(head);
 			ret = nfs_wait_on_request(subreq);
 			if (!ret)
-				ret = nfs_page_group_lock(head, false);
+				ret = nfs_page_group_lock(head);
 			if (ret < 0) {
 				nfs_unroll_locks(inode, head, subreq);
 				nfs_release_request(subreq);
-- 
cgit v1.2.3


From 7e8a30f8b497315a6467d86c82f6cc8acaa9db61 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 17 Jul 2017 16:58:07 -0400
Subject: NFS: Fix up nfs_page_group_covers_page()

Fix up the test in nfs_page_group_covers_page(). The simplest implementation
is to check that we have a set of intersecting or contiguous subrequests
that connect page offset 0 to nfs_page_length(req->wb_page).

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0f418d825185..759e37d26acf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -243,9 +243,6 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
 {
 	struct nfs_page *req;
 
-	WARN_ON_ONCE(head != head->wb_head);
-	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
-
 	req = head;
 	do {
 		if (page_offset >= req->wb_pgbase &&
@@ -273,18 +270,15 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
 
 	nfs_page_group_lock(req);
 
-	do {
+	for (;;) {
 		tmp = nfs_page_group_search_locked(req->wb_head, pos);
-		if (tmp) {
-			/* no way this should happen */
-			WARN_ON_ONCE(tmp->wb_pgbase != pos);
-			pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
-		}
-	} while (tmp && pos < len);
+		if (!tmp)
+			break;
+		pos = tmp->wb_pgbase + tmp->wb_bytes;
+	}
 
 	nfs_page_group_unlock(req);
-	WARN_ON_ONCE(pos > len);
-	return pos == len;
+	return pos >= len;
 }
 
 /* We can set the PG_uptodate flag if we see that a write request
-- 
cgit v1.2.3


From bd37d6fce184836bd5e7cd90ce40116a4fadaf2a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 12:06:30 -0400
Subject: NFSv4: Convert nfs_lock_and_join_requests() to use
 nfs_page_find_head_request()

Hide the locking from nfs_lock_and_join_requests() so that we can
separate out the requirements for swapcache pages.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

(limited to 'fs/nfs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 759e37d26acf..a06167e20b72 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -154,6 +154,14 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
 
+static struct nfs_page *
+nfs_page_private_request(struct page *page)
+{
+	if (!PagePrivate(page))
+		return NULL;
+	return (struct nfs_page *)page_private(page);
+}
+
 /*
  * nfs_page_find_head_request_locked - find head request associated with @page
  *
@@ -164,11 +172,10 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 static struct nfs_page *
 nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
 {
-	struct nfs_page *req = NULL;
+	struct nfs_page *req;
 
-	if (PagePrivate(page))
-		req = (struct nfs_page *)page_private(page);
-	else if (unlikely(PageSwapCache(page)))
+	req = nfs_page_private_request(page);
+	if (!req && unlikely(PageSwapCache(page)))
 		req = nfs_page_search_commits_for_head_request_locked(nfsi,
 			page);
 
@@ -448,31 +455,29 @@ nfs_lock_and_join_requests(struct page *page)
 	int ret;
 
 try_again:
-	if (!(PagePrivate(page) || PageSwapCache(page)))
-		return NULL;
-	spin_lock(&inode->i_lock);
 	/*
 	 * A reference is taken only on the head request which acts as a
 	 * reference to the whole page group - the group will not be destroyed
 	 * until the head reference is released.
 	 */
-	head = nfs_page_find_head_request_locked(NFS_I(inode), page);
-
-	if (!head) {
-		spin_unlock(&inode->i_lock);
+	head = nfs_page_find_head_request(page);
+	if (!head)
 		return NULL;
-	}
 
 	/* lock the page head first in order to avoid an ABBA inefficiency */
 	if (!nfs_lock_request(head)) {
-		spin_unlock(&inode->i_lock);
 		ret = nfs_wait_on_request(head);
 		nfs_release_request(head);
 		if (ret < 0)
 			return ERR_PTR(ret);
 		goto try_again;
 	}
-	spin_unlock(&inode->i_lock);
+
+	/* Ensure that nobody removed the request before we locked it */
+	if (head != nfs_page_private_request(page) && !PageSwapCache(page)) {
+		nfs_unlock_and_release_request(head);
+		goto try_again;
+	}
 
 	ret = nfs_page_group_lock(head);
 	if (ret < 0) {
@@ -559,7 +564,7 @@ try_again:
 		return NULL;
 	}
 
-	/* still holds ref on head from nfs_page_find_head_request_locked
+	/* still holds ref on head from nfs_page_find_head_request
 	 * and still has lock on head from lock loop */
 	return head;
 }
-- 
cgit v1.2.3


From b30d2f04c35d539bf8003b3e014c389abefc249b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 1 Aug 2017 12:26:53 -0400
Subject: NFS: Refactor nfs_page_find_head_request()

Split out the 2 cases so that we can treat the locking differently.
The issue is that the locking in the pageswapcache cache is