summaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-02-06 12:21:01 +0000
committerLinus Torvalds <torvalds@linux-foundation.org>2020-02-06 12:21:01 +0000
commit4c46bef2e96a92df0f40fc91848e56889ef7c15e (patch)
treee27d414870e1ad502b9c2c1455ce45ae0f5499a6 /fs/ceph
parent5b21115414f5b5220e7ab3ca7f5d2c1396f11854 (diff)
parent3325322f773bae68b20d8fa0e9e8ebb005271db5 (diff)
Merge tag 'ceph-for-5.6-rc1' of https://github.com/ceph/ceph-client
Pull ceph fixes from Ilya Dryomov: - a set of patches that fixes various corner cases in mount and umount code (Xiubo Li). This has to do with choosing an MDS, distinguishing between laggy and down MDSes and parsing the server path. - inode initialization fixes (Jeff Layton). The one included here mostly concerns things like open_by_handle() and there is another one that will come through Al. - copy_file_range() now uses the new copy-from2 op (Luis Henriques). The existing copy-from op turned out to be infeasible for generic filesystem use; we disable the copy offload if OSDs don't support copy-from2. - a patch to link "rbd" and "block" devices together in sysfs (Hannes Reinecke) ... and a smattering of cleanups from Xiubo, Jeff and Chengguang. * tag 'ceph-for-5.6-rc1' of https://github.com/ceph/ceph-client: (25 commits) rbd: set the 'device' link in sysfs ceph: move net/ceph/ceph_fs.c to fs/ceph/util.c ceph: print name of xattr in __ceph_{get,set}xattr() douts ceph: print r_direct_hash in hex in __choose_mds() dout ceph: use copy-from2 op in copy_file_range ceph: close holes in structs ceph_mds_session and ceph_mds_request rbd: work around -Wuninitialized warning ceph: allocate the correct amount of extra bytes for the session features ceph: rename get_session and switch to use ceph_get_mds_session ceph: remove the extra slashes in the server path ceph: add possible_max_rank and make the code more readable ceph: print dentry offset in hex and fix xattr_version type ceph: only touch the caps which have the subset mask requested ceph: don't clear I_NEW until inode metadata is fully populated ceph: retry the same mds later after the new session is opened ceph: check availability of mds cluster on mount after wait timeout ceph: keep the session state until it is released ceph: add __send_request helper ceph: ensure we have a new cap before continuing in fill_inode ceph: drop unused ttl_from parameter from fill_inode ...
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/acl.c4
-rw-r--r--fs/ceph/caps.c3
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/dir.c4
-rw-r--r--fs/ceph/file.c11
-rw-r--r--fs/ceph/inode.c47
-rw-r--r--fs/ceph/mds_client.c171
-rw-r--r--fs/ceph/mds_client.h39
-rw-r--r--fs/ceph/mdsmap.c91
-rw-r--r--fs/ceph/super.c128
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/ceph/util.c100
-rw-r--r--fs/ceph/xattr.c7
14 files changed, 436 insertions, 175 deletions
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index c1da294418d1..0a0823d378db 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o quota.o io.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
- debugfs.o
+ debugfs.o util.o
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index aa55f412a6e3..26be6520d3fb 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -222,8 +222,8 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
if (err)
goto out_err;
- err = ceph_pagelist_encode_string(pagelist,
- XATTR_NAME_POSIX_ACL_DEFAULT, len);
+ ceph_pagelist_encode_string(pagelist,
+ XATTR_NAME_POSIX_ACL_DEFAULT, len);
err = posix_acl_to_xattr(&init_user_ns, default_acl,
tmp_buf, val_size2);
if (err < 0)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9d09bb53c1ab..28ae0c134700 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -908,7 +908,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
ci_node);
if (!__cap_is_valid(cap))
continue;
- __touch_cap(cap);
+ if (cap->issued & mask)
+ __touch_cap(cap);
}
}
return 1;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index c281f32b54f7..fb7cabd98e7b 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -33,7 +33,7 @@ static int mdsmap_show(struct seq_file *s, void *p)
seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
- for (i = 0; i < mdsmap->m_num_mds; i++) {
+ for (i = 0; i < mdsmap->possible_max_rank; i++) {
struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
int state = mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2e4764fd1872..d0cd0aba5843 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1186,7 +1186,7 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
struct dentry *dn = di->dentry;
struct ceph_mds_client *mdsc;
- dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+ dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
di, dn, dn, di->offset);
if (!list_empty(&di->lease_list)) {
@@ -1567,7 +1567,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
inode = d_inode(dentry);
}
- dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
+ dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
dentry, inode, ceph_dentry(dentry)->offset);
/* always trust cached snapped dentries, snapdir dentry */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 11929d2bb594..c3b8e8e0bf17 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1974,6 +1974,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
return -EOPNOTSUPP;
+ if (!src_fsc->have_copy_from2)
+ return -EOPNOTSUPP;
+
/*
* Striped file layouts require that we copy partial objects, but the
* OSD copy-from operation only supports full-object copies. Limit
@@ -2101,8 +2104,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
&dst_oid, &dst_oloc,
CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
- CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0);
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
+ dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
if (err) {
+ if (err == -EOPNOTSUPP) {
+ src_fsc->have_copy_from2 = false;
+ pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+ }
dout("ceph_osdc_copy_from returned %d\n", err);
if (!ret)
ret = err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index c07407586ce8..d01710a16a4a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -55,11 +55,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode->i_state & I_NEW)
dout("get_inode created new inode %p %llx.%llx ino %llx\n",
inode, ceph_vinop(inode), (u64)inode->i_ino);
- unlock_new_inode(inode);
- }
dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
vino.snap, inode);
@@ -88,6 +86,10 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
+
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+
return inode;
}
@@ -728,8 +730,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
static int fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_mds_reply_info_in *iinfo,
struct ceph_mds_reply_dirfrag *dirinfo,
- struct ceph_mds_session *session,
- unsigned long ttl_from, int cap_fmode,
+ struct ceph_mds_session *session, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
@@ -754,8 +755,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
info_caps = le32_to_cpu(info->cap.caps);
/* prealloc new cap struct */
- if (info_caps && ceph_snap(inode) == CEPH_NOSNAP)
+ if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
new_cap = ceph_get_cap(mdsc, caps_reservation);
+ if (!new_cap)
+ return -ENOMEM;
+ }
/*
* prealloc xattr data, if it looks like we'll need it. only
@@ -1237,7 +1241,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
if (dir) {
err = fill_inode(dir, NULL,
&rinfo->diri, rinfo->dirfrag,
- session, req->r_request_started, -1,
+ session, -1,
&req->r_caps_reservation);
if (err < 0)
goto done;
@@ -1302,18 +1306,22 @@ retry_lookup:
err = PTR_ERR(in);
goto done;
}
- req->r_target_inode = in;
err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
- session, req->r_request_started,
+ session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
- rinfo->head->result == 0) ? req->r_fmode : -1,
+ rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
pr_err("fill_inode badness %p %llx.%llx\n",
in, ceph_vinop(in));
+ if (in->i_state & I_NEW)
+ discard_new_inode(in);
goto done;
}
+ req->r_target_inode = in;
+ if (in->i_state & I_NEW)
+ unlock_new_inode(in);
}
/*
@@ -1493,12 +1501,18 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
continue;
}
rc = fill_inode(in, NULL, &rde->inode, NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation);
+ -1, &req->r_caps_reservation);
if (rc < 0) {
pr_err("fill_inode badness on %p got %d\n", in, rc);
err = rc;
+ if (in->i_state & I_NEW) {
+ ihold(in);
+ discard_new_inode(in);
+ }
+ } else if (in->i_state & I_NEW) {
+ unlock_new_inode(in);
}
+
/* avoid calling iput_final() in mds dispatch threads */
ceph_async_iput(in);
}
@@ -1694,19 +1708,24 @@ retry_lookup:
}
ret = fill_inode(in, NULL, &rde->inode, NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation);
+ -1, &req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
/* avoid calling iput_final() in mds
* dispatch threads */
+ if (in->i_state & I_NEW) {
+ ihold(in);
+ discard_new_inode(in);
+ }
ceph_async_iput(in);
}
d_drop(dn);
err = ret;
goto next_item;
}
+ if (in->i_state & I_NEW)
+ unlock_new_inode(in);
if (d_really_is_negative(dn)) {
if (ceph_security_xattr_deadlock(in)) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 21ada2c4a88c..bbbbddf71326 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -9,6 +9,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
+#include <linux/bits.h>
#include "super.h"
#include "mds_client.h"
@@ -530,6 +531,7 @@ const char *ceph_session_state_name(int s)
case CEPH_MDS_SESSION_OPEN: return "open";
case CEPH_MDS_SESSION_HUNG: return "hung";
case CEPH_MDS_SESSION_CLOSING: return "closing";
+ case CEPH_MDS_SESSION_CLOSED: return "closed";
case CEPH_MDS_SESSION_RESTARTING: return "restarting";
case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
case CEPH_MDS_SESSION_REJECTED: return "rejected";
@@ -537,7 +539,7 @@ const char *ceph_session_state_name(int s)
}
}
-static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
{
if (refcount_inc_not_zero(&s->s_ref)) {
dout("mdsc get_session %p %d -> %d\n", s,
@@ -568,7 +570,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
{
if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
return NULL;
- return get_session(mdsc->sessions[mds]);
+ return ceph_get_mds_session(mdsc->sessions[mds]);
}
static bool __have_session(struct ceph_mds_client *mdsc, int mds)
@@ -597,7 +599,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *s;
- if (mds >= mdsc->mdsmap->m_num_mds)
+ if (mds >= mdsc->mdsmap->possible_max_rank)
return ERR_PTR(-EINVAL);
s = kzalloc(sizeof(*s), GFP_NOFS);
@@ -674,7 +676,6 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
dout("__unregister_session mds%d %p\n", s->s_mds, s);
BUG_ON(mdsc->sessions[s->s_mds] != s);
mdsc->sessions[s->s_mds] = NULL;
- s->s_state = 0;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
atomic_dec(&mdsc->num_sessions);
@@ -878,7 +879,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry)
* Called under mdsc->mutex.
*/
static int __choose_mds(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
+ struct ceph_mds_request *req,
+ bool *random)
{
struct inode *inode;
struct ceph_inode_info *ci;
@@ -888,6 +890,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
u32 hash = req->r_direct_hash;
bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ if (random)
+ *random = false;
+
/*
* is there a specific mds we should try? ignore hint if we have
* no session and the mds is not up (active or recovering).
@@ -895,7 +900,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
if (req->r_resend_mds >= 0 &&
(__have_session(mdsc, req->r_resend_mds) ||
ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
- dout("choose_mds using resend_mds mds%d\n",
+ dout("%s using resend_mds mds%d\n", __func__,
req->r_resend_mds);
return req->r_resend_mds;
}
@@ -913,7 +918,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_lock();
inode = get_nonsnap_parent(req->r_dentry);
rcu_read_unlock();
- dout("__choose_mds using snapdir's parent %p\n", inode);
+ dout("%s using snapdir's parent %p\n", __func__, inode);
}
} else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
@@ -933,7 +938,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
inode = get_nonsnap_parent(parent);
- dout("__choose_mds using nonsnap parent %p\n", inode);
+ dout("%s using nonsnap parent %p\n", __func__, inode);
} else {
/* dentry target */
inode = d_inode(req->r_dentry);
@@ -949,8 +954,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
rcu_read_unlock();
}
- dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
- (int)hash, mode);
+ dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
+ hash, mode);
if (!inode)
goto random;
ci = ceph_inode(inode);
@@ -968,30 +973,33 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
get_random_bytes(&r, 1);
r %= frag.ndist;
mds = frag.dist[r];
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (%d/%d)\n",
- inode, ceph_vinop(inode),
- frag.frag, mds,
- (int)r, frag.ndist);
+ dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
+ __func__, inode, ceph_vinop(inode),
+ frag.frag, mds, (int)r, frag.ndist);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
- CEPH_MDS_STATE_ACTIVE)
+ CEPH_MDS_STATE_ACTIVE &&
+ !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
goto out;
}
/* since this file/dir wasn't known to be
* replicated, then we want to look for the
* authoritative mds. */
- mode = USE_AUTH_MDS;
if (frag.mds >= 0) {
/* choose auth mds */
mds = frag.mds;
- dout("choose_mds %p %llx.%llx "
- "frag %u mds%d (auth)\n",
- inode, ceph_vinop(inode), frag.frag, mds);
+ dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
+ __func__, inode, ceph_vinop(inode),
+ frag.frag, mds);
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
- CEPH_MDS_STATE_ACTIVE)
- goto out;
+ CEPH_MDS_STATE_ACTIVE) {
+ if (mode == USE_ANY_MDS &&
+ !ceph_mdsmap_is_laggy(mdsc->mdsmap,
+ mds))
+ goto out;
+ }
}
+ mode = USE_AUTH_MDS;
}
}
@@ -1007,7 +1015,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
goto random;
}
mds = cap->session->s_mds;
- dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+ dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
inode, ceph_vinop(inode), mds,
cap == ci->i_auth_cap ? "auth " : "", cap);
spin_unlock(&ci->i_ceph_lock);
@@ -1018,8 +1026,11 @@ out:
return mds;
random:
+ if (random)
+ *random = true;
+
mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
- dout("choose_mds chose random mds%d\n", mds);
+ dout("%s chose random mds%d\n", __func__, mds);
return mds;
}
@@ -1045,20 +1056,21 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
return msg;
}
+static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
+#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
static void encode_supported_features(void **p, void *end)
{
- static const unsigned char bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
- static const size_t count = ARRAY_SIZE(bits);
+ static const size_t count = ARRAY_SIZE(feature_bits);
if (count > 0) {
size_t i;
- size_t size = ((size_t)bits[count - 1] + 64) / 64 * 8;
+ size_t size = FEATURE_BYTES(count);
BUG_ON(*p + 4 + size > end);
ceph_encode_32(p, size);
memset(*p, 0, size);
for (i = 0; i < count; i++)
- ((unsigned char*)(*p))[i / 8] |= 1 << (bits[i] % 8);
+ ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
*p += size;
} else {
BUG_ON(*p + 4 > end);
@@ -1079,6 +1091,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
int metadata_key_count = 0;
struct ceph_options *opt = mdsc->fsc->client->options;
struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
+ size_t size, count;
void *p, *end;
const char* metadata[][2] = {
@@ -1096,8 +1109,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
strlen(metadata[i][1]);
metadata_key_count++;
}
+
/* supported feature */
- extra_bytes += 4 + 8;
+ size = 0;
+ count = ARRAY_SIZE(feature_bits);
+ if (count > 0)
+ size = FEATURE_BYTES(count);
+ extra_bytes += 4 + size;
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
@@ -1117,7 +1135,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v2
+ * ClientSession messages with metadata are v3
*/
msg->hdr.version = cpu_to_le16(3);
msg->hdr.compat_version = cpu_to_le16(1);
@@ -1219,7 +1237,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- if (mds >= mdsc->mdsmap->m_num_mds)
+ if (mds >= mdsc->mdsmap->possible_max_rank)
return;
mi = &mdsc->mdsmap->m_info[mds];
@@ -1967,7 +1985,7 @@ void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
if (mdsc->stopping)
return;
- get_session(session);
+ ceph_get_mds_session(session);
if (queue_work(mdsc->fsc->cap_wq,
&session->s_cap_release_work)) {
dout("cap release work queued\n");
@@ -2516,6 +2534,26 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
}
/*
+ * called under mdsc->mutex
+ */
+static int __send_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_mds_request *req,
+ bool drop_cap_releases)
+{
+ int err;
+
+ err = __prepare_send_request(mdsc, req, session->s_mds,
+ drop_cap_releases);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+
+ return err;
+}
+
+/*
* send request, or put it on the appropriate wait list.
*/
static void __do_request(struct ceph_mds_client *mdsc,
@@ -2524,6 +2562,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session = NULL;
int mds = -1;
int err = 0;
+ bool random;
if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
@@ -2556,15 +2595,14 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (!(mdsc->fsc->mount_options->flags &
CEPH_MOUNT_OPT_MOUNTWAIT) &&
!ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
- err = -ENOENT;
- pr_info("probably no mds server is up\n");
+ err = -EHOSTUNREACH;
goto finish;
}
}
put_request_session(req);
- mds = __choose_mds(mdsc, req);
+ mds = __choose_mds(mdsc, req, &random);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
dout("do_request no mds or not active, waiting for map\n");
@@ -2581,7 +2619,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
goto finish;
}
}
- req->r_session = get_session(session);
+ req->r_session = ceph_get_mds_session(session);
dout("do_request mds%d session %p state %s\n", mds, session,
ceph_session_state_name(session->s_state));
@@ -2592,8 +2630,12 @@ static void __do_request(struct ceph_mds_client *mdsc,
goto out_session;
}
if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
+ session->s_state == CEPH_MDS_SESSION_CLOSING) {
__open_session(mdsc, session);
+ /* retry the same mds later */
+ if (random)
+ req->r_resend_mds = mds;
+ }
list_add(&req->r_wait, &session->s_waiting);
goto out_session;
}
@@ -2604,11 +2646,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_request_started == 0) /* note request start time */
req->r_request_started = jiffies;
- err = __prepare_send_request(mdsc, req, mds, false);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
+ err = __send_request(mdsc, session, req, false);
out_session:
ceph_put_mds_session(session);
@@ -2861,7 +2899,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&mdsc->mutex);
goto out;
} else {
- int mds = __choose_mds(mdsc, req);
+ int mds = __choose_mds(mdsc, req, NULL);
if (mds >= 0 && mds != req->r_session->s_mds) {
dout("but auth changed, so resending\n");
__do_request(mdsc, req);
@@ -2877,6 +2915,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
+ /* last request during umount? */
+ if (mdsc->stopping && !__get_oldest_req(mdsc))
+ complete_all(&mdsc->safe_umount_waiters);
+
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
/*
* We already handled the unsafe response, now do the
@@ -2887,9 +2929,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
*/
dout("got safe reply %llu, mds%d\n", tid, mds);
- /* last unsafe request during umount? */
- if (mdsc->stopping && !__get_oldest_req(mdsc))
- complete_all(&mdsc->safe_umount_waiters);
mutex_unlock(&mdsc->mutex);
goto out;
}
@@ -3104,7 +3143,7 @@ static void handle_session(struct ceph_mds_session *session,
mutex_lock(&mdsc->mutex);
if (op == CEPH_SESSION_CLOSE) {
- get_session(session);
+ ceph_get_mds_session(session);
__unregister_session(mdsc, session);
}
/* FIXME: this ttl calculation is generous */
@@ -3142,6 +3181,7 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_CLOSE:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect denied\n", session->s_mds);
+ session->s_state = CEPH_MDS_SESSION_CLOSED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
wake = 2; /* for good measure */
@@ -3209,7 +3249,6 @@ bad:
return;
}
-
/*
* called under session->mutex.
*/
@@ -3218,18 +3257,12 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
{
struct ceph_mds_request *req, *nreq;
struct rb_node *p;
- int err;
dout("replay_unsafe_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
- list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
- err = __prepare_send_request(mdsc, req, session->s_mds, true);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
- }
+ list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
+ __send_request(mdsc, session, req, true);
/*
* also re-send old requests when MDS enters reconnect stage. So that MDS
@@ -3244,14 +3277,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
if (req->r_attempts == 0)
continue; /* only old requests */
if (req->r_session &&
- req->r_session->s_mds == session->s_mds) {
- err = __prepare_send_request(mdsc, req,
- session->s_mds, true);
- if (!err) {
- ceph_msg_get(req->r_request);
- ceph_con_send(&session->s_con, req->r_request);
- }
- }
+ req->r_session->s_mds == session->s_mds)
+ __send_request(mdsc, session, req, true);
}
mutex_unlock(&mdsc->mutex);
}
@@ -3762,7 +3789,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
dout("check_new_map new %u old %u\n",
newmap->m_epoch, oldmap->m_epoch);
- for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
if (!mdsc->sessions[i])
continue;
s = mdsc->sessions[i];
@@ -3776,9 +3803,9 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
ceph_session_state_name(s->s_state));
- if (i >= newmap->m_num_mds) {
+ if (i >= newmap->possible_max_rank) {
/* force close session for stopped mds */
- get_session(s);
+ ceph_get_mds_session(s);
__unregister_session(mdsc, s);
__wake_requests(mdsc, &s->s_waiting);
mutex_unlock(&mdsc->mutex);
@@ -3833,7 +3860,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
}
}
- for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
+ for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
s = mdsc->sessions[i];
if (!s)
continue;
@@ -4379,7 +4406,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {
if (mdsc->sessions[i]) {
- session = get_session(mdsc->sessions[i]);
+ session = ceph_get_mds_session(mdsc->sessions[i]);
__unregister_session(mdsc, session);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
@@ -4607,11 +4634,8 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- if (get_session(s)) {
- dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
+ if (ceph_get_mds_session(s))
return con;
- }
- dout("mdsc con_get %p FAIL\n", s);
return NULL;
}
@@ -4619,7 +4643,6 @@ static void con_put(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
- dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
ceph_put_mds_session(s);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 14c7e8c49970..27a7446e10d3 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -17,22 +17,31 @@
#include <linux/ceph/auth.h>
/* The first 8 bits are reserved for old ceph releases */
-#define CEPHFS_FEATURE_MIMIC 8
-#define CEPHFS_FEATURE_REPLY_ENCODING 9
-#define CEPHFS_FEATURE_RECLAIM_CLIENT 10
-#define CEPHFS_FEATURE_LAZY_CAP_WANTED 11
-#define CEPHFS_FEATURE_MULTI_RECONNECT 12
+enum ceph_feature_type {
+ CEPHFS_FEATURE_MIMIC = 8,
+ CEPHFS_FEATURE_REPLY_ENCODING,
+ CEPHFS_FEATURE_RECLAIM_CLIENT,
+ CEPHFS_FEATURE_LAZY_CAP_WANTED,
+ CEPHFS_FEATURE_MULTI_RECONNECT,
+
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+};
-#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
+/*
+ * This will always have the highest feature bit value
+ * as the last element of the array.
+ */
+#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
+ \
+ CEPHFS_FEATURE_MAX, \
}
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
-
/*
* Some lock dependencies:
*
@@ -151,7 +160,8 @@ enum {
CEPH_MDS_SESSION_RESTARTING = 5,
CEPH_MDS_SESSION_RECONNECTING = 6,
CEPH_MDS_SESSION_CLOSING = 7,
- CEPH_MDS_SESSION_REJECTED = 8,
+ CEPH_MDS_SESSION_CLOSED = 8,
+ CEPH_MDS_SESSION_REJECTED = 9,
};
struct ceph_mds_session {
@@ -174,6 +184,7 @@ struct ceph_mds_session {
/* protected by s_cap_lock */
spinlock_t s_cap_lock;
+ refcount_t s_ref;
struct list_head s_caps; /* all caps issued by this session */
struct ceph_cap *s_cap_iterator;
int s_nr_caps;
@@ -188,7 +199,6 @@ struct ceph_mds_session {
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
- refcount_t s_ref;
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
};
@@ -224,6 +234,7 @@ struct ceph_mds_request {
struct rb_node r_node;
struct ceph_mds_client *r_mdsc;
+ struct kref r_kref;
int r_op; /* mds op code */
/* operation on what? */
@@ -294,7 +305,6 @@ struct ceph_mds_request {
int r_resend_mds; /* mds to resend to next, if any*/
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
- struct kref r_kref;
struct list_head r_wait;
struct completion r_completion;
struct completion r_safe_completion;
@@ -451,15 +461,10 @@ extern const char *ceph_mds_op_name(int op);
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
-static inline struct ceph_mds_session *
-ceph_get_mds_session(struct ceph_mds_session *s)
-{
- refcount_inc(&s->s_ref);
- return s;
-}
-
extern const char *ceph_session_state_name(int s);
+extern struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s);
extern void ceph_put_mds_session(struct ceph_mds_session *s);
extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 471bac335fae..889627817e52 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -13,30 +13,25 @@
#include "super.h"
+#define CEPH_MDS_IS_READY(i, ignore_laggy) \
+ (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy)
-/*
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
- */
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
{
int n = 0;
int i, j;
- /* special case for one mds */
- if (1 == m->m_num_mds && m->m_info[0].state > 0)
- return 0;
-
/* count */
- for (i = 0; i < m->m_num_mds; i++)
- if (m->m_info[i].state > 0)
+ for (i = 0; i < m->possible_max_rank; i++)
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
n++;
if (n == 0)
return -1;
/* pick */
n = prandom_u32() % n;
- for (j = 0, i = 0; i < m->m_num_mds; i++) {
- if (m->m_info[i].state > 0)
+ for (j = 0, i = 0; i < m->possible_max_rank; i++) {
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
if (j > n)
break;
@@ -45,6 +40,20 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
return i;
}
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+ int mds;
+
+ mds = __mdsmap_get_random_mds(m