summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-08-02 19:39:09 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-08-02 19:39:09 -0400
commit72b5ac54d620b29cae23d25f0405f2765b466f72 (patch)
tree276e6313a16f0821cb3da7061372f37b0feb8ace
parentc7fac299672ee98a1da90ea2e473180fc75d2c53 (diff)
parenta0f2b65275413b3438e9f55b1427273cd893c3b2 (diff)
Merge tag 'ceph-for-4.8-rc1' of git://github.com/ceph/ceph-client
Pull Ceph updates from Ilya Dryomov: "The highlights are: - RADOS namespace support in libceph and CephFS (Zheng Yan and myself). The stopgaps added in 4.5 to deny access to inodes in namespaces are removed and CEPH_FEATURE_FS_FILE_LAYOUT_V2 feature bit is now fully supported - A large rework of the MDS cap flushing code (Zheng Yan) - Handle some of ->d_revalidate() in RCU mode (Jeff Layton). We were overly pessimistic before, bailing at the first sight of LOOKUP_RCU On top of that we've got a few CephFS bug fixes, a couple of cleanups and Arnd's workaround for a weird genksyms issue" * tag 'ceph-for-4.8-rc1' of git://github.com/ceph/ceph-client: (34 commits) ceph: fix symbol versioning for ceph_monc_do_statfs ceph: Correctly return NXIO errors from ceph_llseek ceph: Mark the file cache as unreclaimable ceph: optimize cap flush waiting ceph: cleanup ceph_flush_snaps() ceph: kick cap flushes before sending other cap message ceph: introduce an inode flag to indicates if snapflush is needed ceph: avoid sending duplicated cap flush message ceph: unify cap flush and snapcap flush ceph: use list instead of rbtree to track cap flushes ceph: update types of some local varibles ceph: include 'follows' of pending snapflush in cap reconnect message ceph: update cap reconnect message to version 3 ceph: mount non-default filesystem by name libceph: fsmap.user subscription support ceph: handle LOOKUP_RCU in ceph_d_revalidate ceph: allow dentry_lease_is_valid to work under RCU walk ceph: clear d_fsinfo pointer under d_lock ceph: remove ceph_mdsc_lease_release ceph: don't use ->d_time ...
-rw-r--r--drivers/block/rbd.c15
-rw-r--r--fs/ceph/addr.c77
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c873
-rw-r--r--fs/ceph/dir.c73
-rw-r--r--fs/ceph/file.c77
-rw-r--r--fs/ceph/inode.c40
-rw-r--r--fs/ceph/ioctl.c30
-rw-r--r--fs/ceph/mds_client.c358
-rw-r--r--fs/ceph/mds_client.h19
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/super.c43
-rw-r--r--fs/ceph/super.h48
-rw-r--r--fs/ceph/xattr.c101
-rw-r--r--include/linux/ceph/ceph_fs.h55
-rw-r--r--include/linux/ceph/decode.h55
-rw-r--r--include/linux/ceph/libceph.h4
-rw-r--r--include/linux/ceph/mon_client.h7
-rw-r--r--include/linux/ceph/msgpool.h1
-rw-r--r--include/linux/ceph/osd_client.h1
-rw-r--r--include/linux/ceph/osdmap.h15
-rw-r--r--include/linux/ceph/string_table.h62
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/ceph_common.c2
-rw-r--r--net/ceph/ceph_fs.c30
-rw-r--r--net/ceph/debugfs.c12
-rw-r--r--net/ceph/mon_client.c4
-rw-r--r--net/ceph/msgpool.c1
-rw-r--r--net/ceph/osd_client.c49
-rw-r--r--net/ceph/osdmap.c58
-rw-r--r--net/ceph/string_table.c111
31 files changed, 1412 insertions, 823 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 450662055d97..1a04af6d2421 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1937,7 +1937,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request;
- osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
obj_request->object_name))
goto fail;
@@ -1991,7 +1991,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request;
- osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
obj_request->object_name))
goto fail;
@@ -3995,10 +3995,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
/* Initialize the layout used for all rbd requests */
- rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
- rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+ rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
+ rbd_dev->layout.stripe_count = 1;
+ rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
+ rbd_dev->layout.pool_id = spec->pool_id;
+ RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
/*
* If this is a mapping rbd_dev (as opposed to a parent one),
@@ -5187,7 +5188,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
- rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
if (rbd_dev->image_format == 1)
ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
spec->image_name, RBD_SUFFIX);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 26a9d10d75e9..d5b6f959a3c3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1730,7 +1730,8 @@ enum {
POOL_WRITE = 2,
};
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+ s64 pool, struct ceph_string *pool_ns)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1738,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
struct rb_node **p, *parent;
struct ceph_pool_perm *perm;
struct page **pages;
+ size_t pool_ns_len;
int err = 0, err2 = 0, have = 0;
down_read(&mdsc->pool_perm_rwsem);
@@ -1749,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
up_read(&mdsc->pool_perm_rwsem);
if (*p)
goto out;
- dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+ pool, (int)pool_ns->len, pool_ns->str);
+ else
+ dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
down_write(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
parent = NULL;
while (*p) {
parent = *p;
@@ -1769,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
if (*p) {
@@ -1788,6 +1813,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
rd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
rd_req->r_base_oloc.pool = pool;
+ if (pool_ns)
+ rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
@@ -1841,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out_unlock;
}
- perm = kmalloc(sizeof(*perm), GFP_NOFS);
+ pool_ns_len = pool_ns ? pool_ns->len : 0;
+ perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
if (!perm) {
err = -ENOMEM;
goto out_unlock;
@@ -1849,6 +1877,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
perm->pool = pool;
perm->perm = have;
+ perm->pool_ns_len = pool_ns_len;
+ if (pool_ns_len > 0)
+ memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+ perm->pool_ns[pool_ns_len] = 0;
+
rb_link_node(&perm->node, parent, p);
rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
err = 0;
@@ -1860,43 +1893,46 @@ out_unlock:
out:
if (!err)
err = have;
- dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+ pool, (int)pool_ns->len, pool_ns->str, err);
+ else
+ dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
return err;
}
int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
{
- u32 pool;
+ s64 pool;
+ struct ceph_string *pool_ns;
int ret, flags;
- /* does not support pool namespace yet */
- if (ci->i_pool_ns_len)
- return -EIO;
-
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
spin_lock(&ci->i_ceph_lock);
flags = ci->i_ceph_flags;
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
spin_unlock(&ci->i_ceph_lock);
check:
if (flags & CEPH_I_POOL_PERM) {
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
- dout("ceph_pool_perm_check pool %u no read perm\n",
+ dout("ceph_pool_perm_check pool %lld no read perm\n",
pool);
return -EPERM;
}
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
- dout("ceph_pool_perm_check pool %u no write perm\n",
+ dout("ceph_pool_perm_check pool %lld no write perm\n",
pool);
return -EPERM;
}
return 0;
}
- ret = __ceph_pool_perm_get(ci, pool);
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+ ceph_put_string(pool_ns);
if (ret < 0)
return ret;
@@ -1907,10 +1943,11 @@ check:
flags |= CEPH_I_POOL_WR;
spin_lock(&ci->i_ceph_lock);
- if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
- ci->i_ceph_flags = flags;
+ if (pool == ci->i_layout.pool_id &&
+ pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+ ci->i_ceph_flags |= flags;
} else {
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
flags = ci->i_ceph_flags;
}
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 238c55b01723..5bc5d37b1217 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -71,7 +71,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
&ceph_fscache_fsid_object_def,
fsc, true);
if (!fsc->fscache)
- pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
+ pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
return 0;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6f60d0a3d0f9..99115cae1652 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -40,6 +40,11 @@
* cluster to release server state.
*/
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid);
/*
* Generate readable cap strings for debugging output.
@@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int want = 0;
- int mode;
- for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
- if (ci->i_nr_by_mode[mode])
- want |= ceph_caps_for_mode(mode);
- return want;
+ int i, bits = 0;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (ci->i_nr_by_mode[i])
+ bits |= 1 << i;
+ }
+ if (bits == 0)
+ return 0;
+ return ceph_caps_for_mode(bits >> 1);
}
/*
@@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u32 seq, u64 flush_tid, u64 oldest_flush_tid,
u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
- struct timespec *ctime, u64 time_warp_seq,
+ struct timespec *ctime, u32 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
@@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct inode *inode = &ci->vfs_inode;
u64 cap_id = cap->cap_id;
int held, revoking, dropping, keep;
- u64 seq, issue_seq, mseq, time_warp_seq, follows;
- u64 size, max_size;
+ u64 follows, size, max_size;
+ u32 seq, issue_seq, mseq, time_warp_seq;
struct timespec mtime, atime, ctime;
int wake = 0;
umode_t mode;
@@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
return delayed;
}
+static inline int __send_flush_snap(struct inode *inode,
+ struct ceph_mds_session *session,
+ struct ceph_cap_snap *capsnap,
+ u32 mseq, u64 oldest_flush_tid)
+{
+ return send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->cap_flush.tid,
+ oldest_flush_tid, 0, mseq, capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ &capsnap->ctime, capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ capsnap->xattr_version, capsnap->xattr_blob,
+ capsnap->follows, capsnap->inline_data);
+}
+
/*
* When a snapshot is taken, clients accumulate dirty metadata on
* inodes with capabilities in ceph_cap_snaps to describe the file
@@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Unless @kick is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
* Called under i_ceph_lock. Takes s_mutex as needed.
*/
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int kick)
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
- int mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
- u32 mseq;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
- session->s_mutex */
- u64 next_follows = 0; /* keep track of how far we've gotten through the
- i_cap_snaps list, and skip these entries next time
- around to avoid an infinite loop */
+ u64 oldest_flush_tid = 0;
+ u64 first_tid = 1, last_tid = 0;
- if (psession)
- session = *psession;
+ dout("__flush_snaps %p session %p\n", inode, session);
- dout("__flush_snaps %p\n", inode);
-retry:
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- /* avoid an infiniute loop after retry */
- if (capsnap->follows < next_follows)
- continue;
/*
* we need to wait for sync writes to complete and for dirty
* pages to be written out.
@@ -1263,97 +1271,129 @@ retry:
/* should be removed by ceph_try_drop_cap_snap() */
BUG_ON(!capsnap->need_flush);
- /* pick mds, take s_mutex */
- if (ci->i_auth_cap == NULL) {
- dout("no auth cap (migrating?), doing nothing\n");
- goto out;
- }
-
/* only flush each capsnap once */
- if (!kick && !list_empty(&capsnap->flushing_item)) {
- dout("already flushed %p, skipping\n", capsnap);
+ if (capsnap->cap_flush.tid > 0) {
+ dout(" already flushed %p, skipping\n", capsnap);
continue;
}
- mds = ci->i_auth_cap->session->s_mds;
- mseq = ci->i_auth_cap->mseq;
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&capsnap->cap_flush.g_list,
+ &mdsc->cap_flush_list);
+ if (oldest_flush_tid == 0)
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item,
+ &session->s_cap_flushing);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_add_tail(&capsnap->cap_flush.i_list,
+ &ci->i_cap_flush_list);
- if (session && session->s_mds != mds) {
- dout("oops, wrong session %p mutex\n", session);
- if (kick)
- goto out;
+ if (first_tid == 1)
+ first_tid = capsnap->cap_flush.tid;
+ last_tid = capsnap->cap_flush.tid;
+ }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- session = NULL;
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+ while (first_tid <= last_tid) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct ceph_cap_flush *cf;
+ int ret;
+
+ if (!(cap && cap->session == session)) {
+ dout("__flush_snaps %p auth cap %p not mds%d, "
+ "stop\n", inode, cap, session->s_mds);
+ break;
}
- if (!session) {
- spin_unlock(&ci->i_ceph_lock);
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (session) {
- dout("inverting session/ino locks on %p\n",
- session);
- mutex_lock(&session->s_mutex);
+
+ ret = -ENOENT;
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid >= first_tid) {
+ ret = 0;
+ break;
}
- /*
- * if session == NULL, we raced against a cap
- * deletion or migration. retry, and we'll
- * get a better @mds value next time.
- */
- spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+ if (ret < 0)
+ break;
- spin_lock(&mdsc->cap_dirty_lock);
- capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
- spin_unlock(&mdsc->cap_dirty_lock);
+ first_tid = cf->tid + 1;
+ capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
atomic_inc(&capsnap->nref);
- if (list_empty(&capsnap->flushing_item))
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
spin_unlock(&ci->i_ceph_lock);
- dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
- inode, capsnap, capsnap->follows, capsnap->flush_tid);
- send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0,
- 0, mseq, capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- &capsnap->ctime, capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows, capsnap->inline_data);
-
- next_follows = capsnap->follows + 1;
- ceph_put_cap_snap(capsnap);
+ dout("__flush_snaps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("__flush_snaps: error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid, capsnap->follows);
+ }
+ ceph_put_cap_snap(capsnap);
spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+}
- /* we flushed them all; remove this inode from the queue */
- spin_lock(&mdsc->snap_flush_lock);
- list_del_init(&ci->i_snap_flush_item);
- spin_unlock(&mdsc->snap_flush_lock);
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = *psession;
+ int mds;
+ dout("ceph_flush_snaps %p\n", inode);
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+ dout(" no capsnap needs flush, doing nothing\n");
+ goto out;
+ }
+ if (!ci->i_auth_cap) {
+ dout(" no auth cap (migrating?), doing nothing\n");
+ goto out;
+ }
-out:
- if (psession)
- *psession = session;
- else if (session) {
+ mds = ci->i_auth_cap->session->s_mds;
+ if (session && session->s_mds != mds) {
+ dout(" oops, wrong session %p mutex\n", session);
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout(" inverting session/ino locks on %p\n", session);
+ mutex_lock(&session->s_mutex);
+ }
+ goto retry;
}
-}
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, NULL, 0);
+ __ceph_flush_snaps(ci, session);
+out:
spin_unlock(&ci->i_ceph_lock);
+
+ if (psession) {
+ *psession = session;
+ } else {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
}
/*
@@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return dirty;
}
-static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, i_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->i_node, parent, p);
- rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
-}
-
-static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, g_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->g_node, parent, p);
- rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
-}
-
struct ceph_cap_flush *ceph_alloc_cap_flush(void)
{
return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
@@ -1470,23 +1464,54 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf)
static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
{
- struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
- if (n) {
+ if (!list_empty(&mdsc->cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, g_node);
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
return cf->tid;
}
return 0;
}
/*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+ if (mdsc) {
+ /* are there older pending cap flushes? */
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+ prev = list_prev_entry(cf, g_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->g_list);
+ } else if (ci) {
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+ prev = list_prev_entry(cf, i_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->i_list);
+ } else {
+ BUG_ON(1);
+ }
+ return wake;
+}
+
+/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
*
* Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session,
+ struct ceph_mds_session *session, bool wake,
u64 *flush_tid, u64 *oldest_flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode,
swap(cf, ci->i_prealloc_cap_flush);
cf->caps = flushing;
+ cf->wake = wake;
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
cf->tid = ++mdsc->last_cap_flush_tid;
- __add_cap_flushing_to_mdsc(mdsc, cf);
+ list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
if (list_empty(&ci->i_flushing_item)) {
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
- } else {
- list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) tid %llu\n",
- inode, cf->tid);
}
spin_unlock(&mdsc->cap_dirty_lock);
- __add_cap_flushing_to_inode(ci, cf);
+ list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
*flush_tid = cf->tid;
return flushing;
@@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int tried_invalidate = 0;
- int delayed = 0, sent = 0, force_requeue = 0, num;
- int queue_invalidate = 0;
- int is_delayed = flags & CHECK_CAPS_NODELAY;
+ int delayed = 0, sent = 0, num;
+ bool is_delayed = flags & CHECK_CAPS_NODELAY;
+ bool queue_invalidate = false;
+ bool force_requeue = false;
+ bool tried_invalidate = false;
/* if we are unmounting, flush any unused caps immediately. */
if (mdsc->stopping)
@@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
- /* flush snaps first time around only */
- if (!list_empty(&ci->i_cap_snaps))
- __ceph_flush_snaps(ci, &session, 0);
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1666,17 +1685,17 @@ retry_locked:
if (revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) {
dout("check_caps queuing invalidate\n");
- queue_invalidate = 1;
+ queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
} else {
dout("check_caps failed to invalidate pages\n");
/* we failed to invalidate pages. check these
caps again later. */
- force_requeue = 1;
+ force_requeue = true;
__cap_set_timeouts(mdsc, ci);
}
}
- tried_invalidate = 1;
+ tried_invalidate = true;
goto retry_locked;
}
@@ -1720,10 +1739,15 @@ retry_locked:
}
}
/* flush anything dirty? */
- if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
- ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
- goto ack;
+ if (cap == ci->i_auth_cap) {
+ if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+ dout("flushing snap caps\n");
+ goto ack;
+ }
}
/* completed revocation? going down and there are no caps? */
@@ -1782,6 +1806,26 @@ ack:
goto retry;
}
}
+
+ /* kick flushing and flush snaps before sending normal
+ * cap message */
+ if (cap == ci->i_auth_cap &&
+ (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+
+ goto retry_locked;
+ }
+
/* take snap_rwsem after session mutex */
if (!took_snap_rwsem) {
if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
@@ -1796,7 +1840,7 @@ ack:
}
if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
- flushing = __mark_caps_flushing(inode, session,
+ flushing = __mark_caps_flushing(inode, session, false,
&flush_tid,
&oldest_flush_tid);
} else {
@@ -1822,7 +1866,7 @@ ack:
* otherwise cancel.
*/
if (delayed && is_delayed)
- force_requeue = 1; /* __send_cap delayed release; requeue */
+ force_requeue = true; /* __send_cap delayed release; requeue */
if (!delayed && !is_delayed)
__cap_delay_cancel(mdsc, ci);
else if (!is_delayed || force_requeue)
@@ -1873,8 +1917,8 @@ retry:
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
- flushing = __mark_caps_flushing(inode, session, &flush_tid,
- &oldest_flus