Merge tag 'ceph-for-4.8-rc1' of git://github.com/ceph/ceph-client

Pull Ceph updates from Ilya Dryomov: "The highlights are: - RADOS namespace support in libceph and CephFS (Zheng Yan and myself). The stopgaps added in 4.5 to deny access to inodes in namespaces are removed and CEPH_FEATURE_FS_FILE_LAYOUT_V2 feature bit is now fully supported - A large rework of the MDS cap flushing code (Zheng Yan) - Handle some of ->d_revalidate() in RCU mode (Jeff Layton). We were overly pessimistic before, bailing at the first sight of LOOKUP_RCU On top of that we've got a few CephFS bug fixes, a couple of cleanups and Arnd's workaround for a weird genksyms issue" * tag 'ceph-for-4.8-rc1' of git://github.com/ceph/ceph-client: (34 commits) ceph: fix symbol versioning for ceph_monc_do_statfs ceph: Correctly return NXIO errors from ceph_llseek ceph: Mark the file cache as unreclaimable ceph: optimize cap flush waiting ceph: cleanup ceph_flush_snaps() ceph: kick cap flushes before sending other cap message ceph: introduce an inode flag to indicates if snapflush is needed ceph: avoid sending duplicated cap flush message ceph: unify cap flush and snapcap flush ceph: use list instead of rbtree to track cap flushes ceph: update types of some local varibles ceph: include 'follows' of pending snapflush in cap reconnect message ceph: update cap reconnect message to version 3 ceph: mount non-default filesystem by name libceph: fsmap.user subscription support ceph: handle LOOKUP_RCU in ceph_d_revalidate ceph: allow dentry_lease_is_valid to work under RCU walk ceph: clear d_fsinfo pointer under d_lock ceph: remove ceph_mdsc_lease_release ceph: don't use ->d_time ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2016-08-02 19:39:09 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-08-02 19:39:09 -0400
commit: 72b5ac54d620b29cae23d25f0405f2765b466f72 (patch)
tree: 276e6313a16f0821cb3da7061372f37b0feb8ace
parent: c7fac299672ee98a1da90ea2e473180fc75d2c53 (diff)
parent: a0f2b65275413b3438e9f55b1427273cd893c3b2 (diff)
31 files changed, 1412 insertions, 823 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 450662055d97..1a04af6d2421 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1937,7 +1937,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req->r_callback = rbd_osd_req_callback;
 	osd_req->r_priv = obj_request;
 
-	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
 			     obj_request->object_name))
 		goto fail;
@@ -1991,7 +1991,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req->r_callback = rbd_osd_req_callback;
 	osd_req->r_priv = obj_request;
 
-	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
 	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
 			     obj_request->object_name))
 		goto fail;
@@ -3995,10 +3995,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 
 	/* Initialize the layout used for all rbd requests */
 
-	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
-	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+	rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
+	rbd_dev->layout.stripe_count = 1;
+	rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
+	rbd_dev->layout.pool_id = spec->pool_id;
+	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
 
 	/*
 	 * If this is a mapping rbd_dev (as opposed to a parent one),
@@ -5187,7 +5188,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 
 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
-	rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+	rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
 	if (rbd_dev->image_format == 1)
 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
 				       spec->image_name, RBD_SUFFIX);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 26a9d10d75e9..d5b6f959a3c3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1730,7 +1730,8 @@ enum {
 	POOL_WRITE	= 2,
 };
 
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+				s64 pool, struct ceph_string *pool_ns)
 {
 	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1738,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	struct rb_node **p, *parent;
 	struct ceph_pool_perm *perm;
 	struct page **pages;
+	size_t pool_ns_len;
 	int err = 0, err2 = 0, have = 0;
 
 	down_read(&mdsc->pool_perm_rwsem);
@@ -1749,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		else if (pool > perm->pool)
 			p = &(*p)->rb_right;
 		else {
-			have = perm->perm;
-			break;
+			int ret = ceph_compare_string(pool_ns,
+						perm->pool_ns,
+						perm->pool_ns_len);
+			if (ret < 0)
+				p = &(*p)->rb_left;
+			else if (ret > 0)
+				p = &(*p)->rb_right;
+			else {
+				have = perm->perm;
+				break;
+			}
 		}
 	}
 	up_read(&mdsc->pool_perm_rwsem);
 	if (*p)
 		goto out;
 
-	dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+	if (pool_ns)
+		dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+		     pool, (int)pool_ns->len, pool_ns->str);
+	else
+		dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
 
 	down_write(&mdsc->pool_perm_rwsem);
+	p = &mdsc->pool_perm_tree.rb_node;
 	parent = NULL;
 	while (*p) {
 		parent = *p;
@@ -1769,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		else if (pool > perm->pool)
 			p = &(*p)->rb_right;
 		else {
-			have = perm->perm;
-			break;
+			int ret = ceph_compare_string(pool_ns,
+						perm->pool_ns,
+						perm->pool_ns_len);
+			if (ret < 0)
+				p = &(*p)->rb_left;
+			else if (ret > 0)
+				p = &(*p)->rb_right;
+			else {
+				have = perm->perm;
+				break;
+			}
 		}
 	}
 	if (*p) {
@@ -1788,6 +1813,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	rd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
 	rd_req->r_base_oloc.pool = pool;
+	if (pool_ns)
+		rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
 	ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
 
 	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
@@ -1841,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		goto out_unlock;
 	}
 
-	perm = kmalloc(sizeof(*perm), GFP_NOFS);
+	pool_ns_len = pool_ns ? pool_ns->len : 0;
+	perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
 	if (!perm) {
 		err = -ENOMEM;
 		goto out_unlock;
@@ -1849,6 +1877,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
 	perm->pool = pool;
 	perm->perm = have;
+	perm->pool_ns_len = pool_ns_len;
+	if (pool_ns_len > 0)
+		memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+	perm->pool_ns[pool_ns_len] = 0;
+
 	rb_link_node(&perm->node, parent, p);
 	rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
 	err = 0;
@@ -1860,43 +1893,46 @@ out_unlock:
 out:
 	if (!err)
 		err = have;
-	dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+	if (pool_ns)
+		dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+		     pool, (int)pool_ns->len, pool_ns->str, err);
+	else
+		dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
 	return err;
 }
 
 int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
 {
-	u32 pool;
+	s64 pool;
+	struct ceph_string *pool_ns;
 	int ret, flags;
 
-	/* does not support pool namespace yet */
-	if (ci->i_pool_ns_len)
-		return -EIO;
-
 	if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
 				NOPOOLPERM))
 		return 0;
 
 	spin_lock(&ci->i_ceph_lock);
 	flags = ci->i_ceph_flags;
-	pool = ceph_file_layout_pg_pool(ci->i_layout);
+	pool = ci->i_layout.pool_id;
 	spin_unlock(&ci->i_ceph_lock);
 check:
 	if (flags & CEPH_I_POOL_PERM) {
 		if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
-			dout("ceph_pool_perm_check pool %u no read perm\n",
+			dout("ceph_pool_perm_check pool %lld no read perm\n",
 			     pool);
 			return -EPERM;
 		}
 		if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
-			dout("ceph_pool_perm_check pool %u no write perm\n",
+			dout("ceph_pool_perm_check pool %lld no write perm\n",
 			     pool);
 			return -EPERM;
 		}
 		return 0;
 	}
 
-	ret = __ceph_pool_perm_get(ci, pool);
+	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+	ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+	ceph_put_string(pool_ns);
 	if (ret < 0)
 		return ret;
 
@@ -1907,10 +1943,11 @@ check:
 		flags |= CEPH_I_POOL_WR;
 
 	spin_lock(&ci->i_ceph_lock);
-	if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
-		ci->i_ceph_flags = flags;
+	if (pool == ci->i_layout.pool_id &&
+	    pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+		ci->i_ceph_flags |= flags;
         } else {
-		pool = ceph_file_layout_pg_pool(ci->i_layout);
+		pool = ci->i_layout.pool_id;
 		flags = ci->i_ceph_flags;
 	}
 	spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 238c55b01723..5bc5d37b1217 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -71,7 +71,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
 					      &ceph_fscache_fsid_object_def,
 					      fsc, true);
 	if (!fsc->fscache)
-		pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
+		pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
 
 	return 0;
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6f60d0a3d0f9..99115cae1652 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -40,6 +40,11 @@
  * cluster to release server state.
  */
 
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session,
+				 struct ceph_inode_info *ci,
+				 u64 oldest_flush_tid);
 
 /*
  * Generate readable cap strings for debugging output.
@@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
  */
 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 {
-	int want = 0;
-	int mode;
-	for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
-		if (ci->i_nr_by_mode[mode])
-			want |= ceph_caps_for_mode(mode);
-	return want;
+	int i, bits = 0;
+	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+		if (ci->i_nr_by_mode[i])
+			bits |= 1 << i;
+	}
+	if (bits == 0)
+		return 0;
+	return ceph_caps_for_mode(bits >> 1);
 }
 
 /*
@@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
 			u32 seq, u64 flush_tid, u64 oldest_flush_tid,
 			u32 issue_seq, u32 mseq, u64 size, u64 max_size,
 			struct timespec *mtime, struct timespec *atime,
-			struct timespec *ctime, u64 time_warp_seq,
+			struct timespec *ctime, u32 time_warp_seq,
 			kuid_t uid, kgid_t gid, umode_t mode,
 			u64 xattr_version,
 			struct ceph_buffer *xattrs_buf,
@@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	struct inode *inode = &ci->vfs_inode;
 	u64 cap_id = cap->cap_id;
 	int held, revoking, dropping, keep;
-	u64 seq, issue_seq, mseq, time_warp_seq, follows;
-	u64 size, max_size;
+	u64 follows, size, max_size;
+	u32 seq, issue_seq, mseq, time_warp_seq;
 	struct timespec mtime, atime, ctime;
 	int wake = 0;
 	umode_t mode;
@@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	return delayed;
 }
 
+static inline int __send_flush_snap(struct inode *inode,
+				    struct ceph_mds_session *session,
+				    struct ceph_cap_snap *capsnap,
+				    u32 mseq, u64 oldest_flush_tid)
+{
+	return send_cap_msg(session, ceph_vino(inode).ino, 0,
+			CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+			capsnap->dirty, 0, capsnap->cap_flush.tid,
+			oldest_flush_tid, 0, mseq, capsnap->size, 0,
+			&capsnap->mtime, &capsnap->atime,
+			&capsnap->ctime, capsnap->time_warp_seq,
+			capsnap->uid, capsnap->gid, capsnap->mode,
+			capsnap->xattr_version, capsnap->xattr_blob,
+			capsnap->follows, capsnap->inline_data);
+}
+
 /*
  * When a snapshot is taken, clients accumulate dirty metadata on
  * inodes with capabilities in ceph_cap_snaps to describe the file
@@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
- * Unless @kick is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
  * Called under i_ceph_lock.  Takes s_mutex as needed.
  */
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
-			struct ceph_mds_session **psession,
-			int kick)
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+			       struct ceph_mds_session *session)
 		__releases(ci->i_ceph_lock)
 		__acquires(ci->i_ceph_lock)
 {
 	struct inode *inode = &ci->vfs_inode;
-	int mds;
+	struct ceph_mds_client *mdsc = session->s_mdsc;
 	struct ceph_cap_snap *capsnap;
-	u32 mseq;
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
-						    session->s_mutex */
-	u64 next_follows = 0;  /* keep track of how far we've gotten through the
-			     i_cap_snaps list, and skip these entries next time
-			     around to avoid an infinite loop */
+	u64 oldest_flush_tid = 0;
+	u64 first_tid = 1, last_tid = 0;
 
-	if (psession)
-		session = *psession;
+	dout("__flush_snaps %p session %p\n", inode, session);
 
-	dout("__flush_snaps %p\n", inode);
-retry:
 	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
-		/* avoid an infiniute loop after retry */
-		if (capsnap->follows < next_follows)
-			continue;
 		/*
 		 * we need to wait for sync writes to complete and for dirty
 		 * pages to be written out.
@@ -1263,97 +1271,129 @@ retry:
 		/* should be removed by ceph_try_drop_cap_snap() */
 		BUG_ON(!capsnap->need_flush);
 
-		/* pick mds, take s_mutex */
-		if (ci->i_auth_cap == NULL) {
-			dout("no auth cap (migrating?), doing nothing\n");
-			goto out;
-		}
-
 		/* only flush each capsnap once */
-		if (!kick && !list_empty(&capsnap->flushing_item)) {
-			dout("already flushed %p, skipping\n", capsnap);
+		if (capsnap->cap_flush.tid > 0) {
+			dout(" already flushed %p, skipping\n", capsnap);
 			continue;
 		}
 
-		mds = ci->i_auth_cap->session->s_mds;
-		mseq = ci->i_auth_cap->mseq;
+		spin_lock(&mdsc->cap_dirty_lock);
+		capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+		list_add_tail(&capsnap->cap_flush.g_list,
+			      &mdsc->cap_flush_list);
+		if (oldest_flush_tid == 0)
+			oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+		if (list_empty(&ci->i_flushing_item)) {
+			list_add_tail(&ci->i_flushing_item,
+				      &session->s_cap_flushing);
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+
+		list_add_tail(&capsnap->cap_flush.i_list,
+			      &ci->i_cap_flush_list);
 
-		if (session && session->s_mds != mds) {
-			dout("oops, wrong session %p mutex\n", session);
-			if (kick)
-				goto out;
+		if (first_tid == 1)
+			first_tid = capsnap->cap_flush.tid;
+		last_tid = capsnap->cap_flush.tid;
+	}
 
-			mutex_unlock(&session->s_mutex);
-			ceph_put_mds_session(session);
-			session = NULL;
+	ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+	while (first_tid <= last_tid) {
+		struct ceph_cap *cap = ci->i_auth_cap;
+		struct ceph_cap_flush *cf;
+		int ret;
+
+		if (!(cap && cap->session == session)) {
+			dout("__flush_snaps %p auth cap %p not mds%d, "
+			     "stop\n", inode, cap, session->s_mds);
+			break;
 		}
-		if (!session) {
-			spin_unlock(&ci->i_ceph_lock);
-			mutex_lock(&mdsc->mutex);
-			session = __ceph_lookup_mds_session(mdsc, mds);
-			mutex_unlock(&mdsc->mutex);
-			if (session) {
-				dout("inverting session/ino locks on %p\n",
-				     session);
-				mutex_lock(&session->s_mutex);
+
+		ret = -ENOENT;
+		list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+			if (cf->tid >= first_tid) {
+				ret = 0;
+				break;
 			}
-			/*
-			 * if session == NULL, we raced against a cap
-			 * deletion or migration.  retry, and we'll
-			 * get a better @mds value next time.
-			 */
-			spin_lock(&ci->i_ceph_lock);
-			goto retry;
 		}
+		if (ret < 0)
+			break;
 
-		spin_lock(&mdsc->cap_dirty_lock);
-		capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
-		spin_unlock(&mdsc->cap_dirty_lock);
+		first_tid = cf->tid + 1;
 
+		capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
 		atomic_inc(&capsnap->nref);
-		if (list_empty(&capsnap->flushing_item))
-			list_add_tail(&capsnap->flushing_item,
-				      &session->s_cap_snaps_flushing);
 		spin_unlock(&ci->i_ceph_lock);
 
-		dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
-		     inode, capsnap, capsnap->follows, capsnap->flush_tid);
-		send_cap_msg(session, ceph_vino(inode).ino, 0,
-			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
-			     capsnap->dirty, 0, capsnap->flush_tid, 0,
-			     0, mseq, capsnap->size, 0,
-			     &capsnap->mtime, &capsnap->atime,
-			     &capsnap->ctime, capsnap->time_warp_seq,
-			     capsnap->uid, capsnap->gid, capsnap->mode,
-			     capsnap->xattr_version, capsnap->xattr_blob,
-			     capsnap->follows, capsnap->inline_data);
-
-		next_follows = capsnap->follows + 1;
-		ceph_put_cap_snap(capsnap);
+		dout("__flush_snaps %p capsnap %p tid %llu %s\n",
+		     inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+
+		ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+					oldest_flush_tid);
+		if (ret < 0) {
+			pr_err("__flush_snaps: error sending cap flushsnap, "
+			       "ino (%llx.%llx) tid %llu follows %llu\n",
+				ceph_vinop(inode), cf->tid, capsnap->follows);
+		}
 
+		ceph_put_cap_snap(capsnap);
 		spin_lock(&ci->i_ceph_lock);
-		goto retry;
 	}
+}
 
-	/* we flushed them all; remove this inode from the queue */
-	spin_lock(&mdsc->snap_flush_lock);
-	list_del_init(&ci->i_snap_flush_item);
-	spin_unlock(&mdsc->snap_flush_lock);
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+		      struct ceph_mds_session **psession)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_session *session = *psession;
+	int mds;
+	dout("ceph_flush_snaps %p\n", inode);
+retry:
+	spin_lock(&ci->i_ceph_lock);
+	if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+		dout(" no capsnap needs flush, doing nothing\n");
+		goto out;
+	}
+	if (!ci->i_auth_cap) {
+		dout(" no auth cap (migrating?), doing nothing\n");
+		goto out;
+	}
 
-out:
-	if (psession)
-		*psession = session;
-	else if (session) {
+	mds = ci->i_auth_cap->session->s_mds;
+	if (session && session->s_mds != mds) {
+		dout(" oops, wrong session %p mutex\n", session);
 		mutex_unlock(&session->s_mutex);
 		ceph_put_mds_session(session);
+		session = NULL;
+	}
+	if (!session) {
+		spin_unlock(&ci->i_ceph_lock);
+		mutex_lock(&mdsc->mutex);
+		session = __ceph_lookup_mds_session(mdsc, mds);
+		mutex_unlock(&mdsc->mutex);
+		if (session) {
+			dout(" inverting session/ino locks on %p\n", session);
+			mutex_lock(&session->s_mutex);
+		}
+		goto retry;
 	}
-}
 
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
-	spin_lock(&ci->i_ceph_lock);
-	__ceph_flush_snaps(ci, NULL, 0);
+	__ceph_flush_snaps(ci, session);
+out:
 	spin_unlock(&ci->i_ceph_lock);
+
+	if (psession) {
+		*psession = session;
+	} else {
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+	}
+	/* we flushed them all; remove this inode from the queue */
+	spin_lock(&mdsc->snap_flush_lock);
+	list_del_init(&ci->i_snap_flush_item);
+	spin_unlock(&mdsc->snap_flush_lock);
 }
 
 /*
@@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
 	return dirty;
 }
 
-static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
-					struct ceph_cap_flush *cf)
-{
-	struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_cap_flush *other = NULL;
-
-	while (*p) {
-		parent = *p;
-		other = rb_entry(parent, struct ceph_cap_flush, i_node);
-
-		if (cf->tid < other->tid)
-			p = &(*p)->rb_left;
-		else if (cf->tid > other->tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&cf->i_node, parent, p);
-	rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
-}
-
-static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
-				       struct ceph_cap_flush *cf)
-{
-	struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_cap_flush *other = NULL;
-
-	while (*p) {
-		parent = *p;
-		other = rb_entry(parent, struct ceph_cap_flush, g_node);
-
-		if (cf->tid < other->tid)
-			p = &(*p)->rb_left;
-		else if (cf->tid > other->tid)
-			p = &(*p)->rb_right;
-		else
-			BUG();
-	}
-
-	rb_link_node(&cf->g_node, parent, p);
-	rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
-}
-
 struct ceph_cap_flush *ceph_alloc_cap_flush(void)
 {
 	return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
@@ -1470,23 +1464,54 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf)
 
 static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
 {
-	struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
-	if (n) {
+	if (!list_empty(&mdsc->cap_flush_list)) {
 		struct ceph_cap_flush *cf =
-			rb_entry(n, struct ceph_cap_flush, g_node);
+			list_first_entry(&mdsc->cap_flush_list,
+					 struct ceph_cap_flush, g_list);
 		return cf->tid;
 	}
 	return 0;
 }
 
 /*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
+			       struct ceph_inode_info *ci,
+			       struct ceph_cap_flush *cf)
+{
+	struct ceph_cap_flush *prev;
+	bool wake = cf->wake;
+	if (mdsc) {
+		/* are there older pending cap flushes? */
+		if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+			prev = list_prev_entry(cf, g_list);
+			prev->wake = true;
+			wake = false;
+		}
+		list_del(&cf->g_list);
+	} else if (ci) {
+		if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+			prev = list_prev_entry(cf, i_list);
+			prev->wake = true;
+			wake = false;
+		}
+		list_del(&cf->i_list);
+	} else {
+		BUG_ON(1);
+	}
+	return wake;
+}
+
+/*
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
  *
  * Called under i_ceph_lock.
  */
 static int __mark_caps_flushing(struct inode *inode,
-				struct ceph_mds_session *session,
+				struct ceph_mds_session *session, bool wake,
 				u64 *flush_tid, u64 *oldest_flush_tid)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode,
 
 	swap(cf, ci->i_prealloc_cap_flush);
 	cf->caps = flushing;
+	cf->wake = wake;
 
 	spin_lock(&mdsc->cap_dirty_lock);
 	list_del_init(&ci->i_dirty_item);
 
 	cf->tid = ++mdsc->last_cap_flush_tid;
-	__add_cap_flushing_to_mdsc(mdsc, cf);
+	list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
 	*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
 
 	if (list_empty(&ci->i_flushing_item)) {
 		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
 		mdsc->num_cap_flushing++;
-		dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
-	} else {
-		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
-		dout(" inode %p now flushing (more) tid %llu\n",
-		     inode, cf->tid);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
 
-	__add_cap_flushing_to_inode(ci, cf);
+	list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
 
 	*flush_tid = cf->tid;
 	return flushing;
@@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
 			   to avoid an infinite loop on retry */
 	struct rb_node *p;
-	int tried_invalidate = 0;
-	int delayed = 0, sent = 0, force_requeue = 0, num;
-	int queue_invalidate = 0;
-	int is_delayed = flags & CHECK_CAPS_NODELAY;
+	int delayed = 0, sent = 0, num;
+	bool is_delayed = flags & CHECK_CAPS_NODELAY;
+	bool queue_invalidate = false;
+	bool force_requeue = false;
+	bool tried_invalidate = false;
 
 	/* if we are unmounting, flush any unused caps immediately. */
 	if (mdsc->stopping)
@@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	if (ci->i_ceph_flags & CEPH_I_FLUSH)
 		flags |= CHECK_CAPS_FLUSH;
 
-	/* flush snaps first time around only */
-	if (!list_empty(&ci->i_cap_snaps))
-		__ceph_flush_snaps(ci, &session, 0);
 	goto retry_locked;
 retry:
 	spin_lock(&ci->i_ceph_lock);
@@ -1666,17 +1685,17 @@ retry_locked:
 			if (revoking & (CEPH_CAP_FILE_CACHE|
 					CEPH_CAP_FILE_LAZYIO)) {
 				dout("check_caps queuing invalidate\n");
-				queue_invalidate = 1;
+				queue_invalidate = true;
 				ci->i_rdcache_revoking = ci->i_rdcache_gen;
 			} else {
 				dout("check_caps failed to invalidate pages\n");
 				/* we failed to invalidate pages.  check these
 				   caps again later. */
-				force_requeue = 1;
+				force_requeue = true;
 				__cap_set_timeouts(mdsc, ci);
 			}
 		}
-		tried_invalidate = 1;
+		tried_invalidate = true;
 		goto retry_locked;
 	}
 
@@ -1720,10 +1739,15 @@ retry_locked:
 			}
 		}
 		/* flush anything dirty? */
-		if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
-		    ci->i_dirty_caps) {
-			dout("flushing dirty caps\n");
-			goto ack;
+		if (cap == ci->i_auth_cap) {
+			if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+				dout("flushing dirty caps\n");
+				goto ack;
+			}
+			if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+				dout("flushing snap caps\n");
+				goto ack;
+			}
 		}
 
 		/* completed revocation? going down and there are no caps? */
@@ -1782,6 +1806,26 @@ ack:
 				goto retry;
 			}
 		}
+
+		/* kick flushing and flush snaps before sending normal
+		 * cap message */
+		if (cap == ci->i_auth_cap &&
+		    (ci->i_ceph_flags &
+		     (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+			if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+				spin_lock(&mdsc->cap_dirty_lock);
+				oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+				spin_unlock(&mdsc->cap_dirty_lock);
+				__kick_flushing_caps(mdsc, session, ci,
+						     oldest_flush_tid);
+				ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+			}
+			if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+				__ceph_flush_snaps(ci, session);
+
+			goto retry_locked;
+		}
+
 		/* take snap_rwsem after session mutex */
 		if (!took_snap_rwsem) {
 			if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
@@ -1796,7 +1840,7 @@ ack:
 		}
 
 		if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
-			flushing = __mark_caps_flushing(inode, session,
+			flushing = __mark_caps_flushing(inode, session, false,
 							&flush_tid,
 							&oldest_flush_tid);
 		} else {
@@ -1822,7 +1866,7 @@ ack:
 	 * otherwise cancel.
 	 */
 	if (delayed && is_delayed)
-		force_requeue = 1;   /* __send_cap delayed release; requeue */
+		force_requeue = true;   /* __send_cap delayed release; requeue */
 	if (!delayed && !is_delayed)
 		__cap_delay_cancel(mdsc, ci);
 	else if (!is_delayed || force_requeue)
@@ -1873,8 +1917,8 @@ retry:
 		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
 			goto out;
 
-		flushing = __mark_caps_flushing(inode, session, &flush_tid,
-						&oldest_flus
author	Linus Torvalds <torvalds@linux-foundation.org>	2016-08-02 19:39:09 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-08-02 19:39:09 -0400
commit	72b5ac54d620b29cae23d25f0405f2765b466f72 (patch)
tree	276e6313a16f0821cb3da7061372f37b0feb8ace
parent	c7fac299672ee98a1da90ea2e473180fc75d2c53 (diff)
parent	a0f2b65275413b3438e9f55b1427273cd893c3b2 (diff)