From 03f4fcb02884859b584c709652bb48f8125ceb45 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 5 Jan 2015 11:04:04 +0800 Subject: ceph: handle SESSION_FORCE_RO message mark session as readonly and wake up all cap waiters. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 15 +++++++++++++++ fs/ceph/mds_client.c | 10 ++++++++++ fs/ceph/mds_client.h | 1 + 3 files changed, 26 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b93c631c6c87..d0618e8412fd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2143,6 +2143,21 @@ again: ret = 1; } } else { + int session_readonly = false; + if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { + struct ceph_mds_session *s = ci->i_auth_cap->session; + spin_lock(&s->s_cap_lock); + session_readonly = s->s_readonly; + spin_unlock(&s->s_cap_lock); + } + if (session_readonly) { + dout("get_cap_refs %p needed %s but mds%d readonly\n", + inode, ceph_cap_string(need), ci->i_auth_cap->mds); + *err = -EROFS; + ret = 1; + goto out_unlock; + } + dout("get_cap_refs %p have %s needed %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d2171f4a6980..c6c33b411a2f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2580,6 +2580,14 @@ static void handle_session(struct ceph_mds_session *session, send_flushmsg_ack(mdsc, session, seq); break; + case CEPH_SESSION_FORCE_RO: + dout("force_session_readonly %p\n", session); + spin_lock(&session->s_cap_lock); + session->s_readonly = true; + spin_unlock(&session->s_cap_lock); + wake_up_session_caps(session, 0); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -2791,6 +2799,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, spin_unlock(&session->s_gen_ttl_lock); spin_lock(&session->s_cap_lock); + /* don't know if session is readonly */ + session->s_readonly = 0; /* * notify __ceph_remove_cap() that we are composing cap reconnect. 
* If a cap get released before being added to the cap reconnect, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e2817d00f7d9..a87b92f500bb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -137,6 +137,7 @@ struct ceph_mds_session { int s_nr_caps, s_trim_caps; int s_num_cap_releases; int s_cap_reconnect; + int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; -- cgit v1.2.3 From 671762f8071563847e50f45c6fb0b329e6e8cf9a Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Sun, 4 Jan 2015 00:44:54 +0100 Subject: ceph: acl: Remove unused function Remove the function ceph_get_cached_acl() that is not used anywhere. This was partially found by using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Reviewed-by: Yan, Zheng --- fs/ceph/acl.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'fs') diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 5bd853ba44ff..64fa248343f6 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode, spin_unlock(&ci->i_ceph_lock); } -static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, - int type) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct posix_acl *acl = ACL_NOT_CACHED; - - spin_lock(&ci->i_ceph_lock); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) - acl = get_cached_acl(inode, type); - spin_unlock(&ci->i_ceph_lock); - - return acl; -} - struct posix_acl *ceph_get_acl(struct inode *inode, int type) { int size; -- cgit v1.2.3 From 1487a688d8ea596e6710b0d256300ab10ce99284 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 6 Jan 2015 15:29:14 +0800 Subject: ceph: properly zero data pages for file holes. A bug is found in striped_read() of fs/ceph/file.c. striped_read() calls ceph_zero_page_vector_range().
The first argument, page_align + read + ret, passed to ceph_zero_page_vector_range() is wrong. When a file has holes, this wrong parameter may cause memory corruption either in kernel space or user space. Kernel space memory may be corrupted in the case of non direct IO; user space memory may be corrupted in the case of direct IO. In the latter case, the application doing direct IO may crash due to memory corruption, as we have experienced. The correct value should be initial_align + read + ret, where initial_align = o_direct ? buf_align : io_align. Compared with page_align, the current page offset, initial_align is the initial page offset, which should be used to calculate the page and offset in ceph_zero_page_vector_range(). Reported-by: caifeng zhu Signed-off-by: Yan, Zheng --- fs/ceph/file.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce74b394b49d..663da44c06b6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -392,13 +392,14 @@ more: if (ret >= 0) { int didpages; if (was_short && (pos + ret < inode->i_size)) { - u64 tmp = min(this_len - ret, - inode->i_size - pos - ret); + int zlen = min(this_len - ret, + inode->i_size - pos - ret); + int zoff = (o_direct ? buf_align : io_align) + + read + ret; dout(" zero gap %llu to %llu\n", - pos + ret, pos + ret + tmp); - ceph_zero_page_vector_range(page_align + read + ret, - tmp, pages); - ret += tmp; + pos + ret, pos + ret + zlen); + ceph_zero_page_vector_range(zoff, zlen, pages); + ret += zlen; } didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; -- cgit v1.2.3 From 982d6011bc30a26e8a3d546e0e7fc7db2c255d85 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 23 Dec 2014 15:30:54 +0800 Subject: ceph: improve reference tracking for snaprealm When snaprealm is created, its initial reference count is zero. But in some rare cases, the newly created snaprealm is not referenced by anyone.
This causes a snaprealm with zero reference count to never be freed. The fix is to set the reference count of a newly created snaprealm to 1. The reference is returned to the function that requests creation of the snaprealm. When the function finishes its job, it releases the reference. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 24 +++++++++++++++-------- fs/ceph/mds_client.c | 9 +++++++-- fs/ceph/snap.c | 54 ++++++++++++++++++++++++++++++++++++---------------- fs/ceph/super.h | 3 ++- 4 files changed, 63 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d0618e8412fd..8ed1192606d9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode, struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); if (realm) { - ceph_get_snap_realm(mdsc, realm); spin_lock(&realm->inodes_with_caps_lock); ci->i_snap_realm = realm; list_add(&ci->i_snap_realm_item, @@ -2447,13 +2446,13 @@ static void invalidate_aliases(struct inode *inode) */ static void handle_cap_grant(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *grant, - void *snaptrace, int snaptrace_len, u64 inline_version, void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, struct ceph_cap *cap, int issued) __releases(ci->i_ceph_lock) + __releases(mdsc->snap_rwsem) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2654,10 +2653,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, - snaptrace + snaptrace_len, false); - downgrade_write(&mdsc->snap_rwsem); kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); if (newcaps & ~issued) @@ -3067,6 +3062,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_cap *cap; struct ceph_mds_caps *h; struct
ceph_mds_cap_peer *peer = NULL; + struct ceph_snap_realm *realm; int mds = session->s_mds; int op, issued; u32 seq, mseq; @@ -3168,11 +3164,23 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done_unlocked; case CEPH_CAP_OP_IMPORT: + realm = NULL; + if (snaptrace_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } handle_cap_import(mdsc, inode, h, peer, session, &cap, &issued); - handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, msg->middle, session, cap, issued); + if (realm) + ceph_put_snap_realm(mdsc, realm); goto done_unlocked; } @@ -3192,7 +3200,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - handle_cap_grant(mdsc, inode, h, NULL, 0, + handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, msg->middle, session, cap, issued); goto done_unlocked; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c6c33b411a2f..85c67ae03e46 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2286,6 +2286,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) struct ceph_mds_request *req; struct ceph_mds_reply_head *head = msg->front.iov_base; struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + struct ceph_snap_realm *realm; u64 tid; int err, result; int mds = session->s_mds; @@ -2401,11 +2402,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* snap trace */ + realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); ceph_update_snap_trace(mdsc, rinfo->snapblob, - rinfo->snapblob + rinfo->snapblob_len, - le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); + rinfo->snapblob + 
rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, + &realm); downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -2423,6 +2426,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); + if (realm) + ceph_put_snap_realm(mdsc, realm); out_err: mutex_lock(&mdsc->mutex); if (!req->r_aborted) { diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index ce35fbd4ba5d..a97e39f09ba6 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, * safe. we do need to protect against concurrent empty list * additions, however. */ - if (atomic_read(&realm->nref) == 0) { + if (atomic_inc_return(&realm->nref) == 1) { spin_lock(&mdsc->snap_empty_lock); list_del_init(&realm->empty_item); spin_unlock(&mdsc->snap_empty_lock); } - - atomic_inc(&realm->nref); } static void __insert_snap_realm(struct rb_root *root, @@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 0); /* tree does not take a ref */ + atomic_set(&realm->nref, 1); /* for caller */ realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); @@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm( * * caller must hold snap_rwsem for write. 
*/ -struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, - u64 ino) +static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) { struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; @@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, return NULL; } +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *r; + r = __lookup_snap_realm(mdsc, ino); + if (r) + ceph_get_snap_realm(mdsc, r); + return r; +} + static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); @@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, } realm->parent_ino = parentino; realm->parent = parent; - ceph_get_snap_realm(mdsc, parent); list_add(&realm->child_item, &parent->children); return 1; } @@ -631,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) * Caller must hold snap_rwsem for write. 
*/ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, - void *p, void *e, bool deletion) + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret) { struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ - struct ceph_snap_realm *realm; + struct ceph_snap_realm *realm = NULL; + struct ceph_snap_realm *first_realm = NULL; int invalidate = 0; int err = -ENOMEM; LIST_HEAD(dirty_realms); @@ -704,13 +713,18 @@ more: dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, realm, invalidate, p, e); - if (p < e) - goto more; - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate) + if (invalidate && p >= e) rebuild_snap_realms(realm); + if (!first_realm) + first_realm = realm; + else + ceph_put_snap_realm(mdsc, realm); + + if (p < e) + goto more; + /* * queue cap snaps _after_ we've built the new snap contexts, * so that i_head_snapc can be set appropriately. @@ -721,12 +735,21 @@ more: queue_realm_cap_snaps(realm); } + if (realm_ret) + *realm_ret = first_realm; + else + ceph_put_snap_realm(mdsc, first_realm); + __cleanup_empty_realms(mdsc); return 0; bad: err = -EINVAL; fail: + if (realm && !IS_ERR(realm)) + ceph_put_snap_realm(mdsc, realm); + if (first_realm) + ceph_put_snap_realm(mdsc, first_realm); pr_err("update_snap_trace error %d\n", err); return err; } @@ -844,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, if (IS_ERR(realm)) goto out; } - ceph_get_snap_realm(mdsc, realm); dout("splitting snap_realm %llx %p\n", realm->ino, realm); for (i = 0; i < num_split_inos; i++) { @@ -905,7 +927,7 @@ skip_inode: /* we may have taken some of the old realm's children. */ for (i = 0; i < num_split_realms; i++) { struct ceph_snap_realm *child = - ceph_lookup_snap_realm(mdsc, + __lookup_snap_realm(mdsc, le64_to_cpu(split_realms[i])); if (!child) continue; @@ -918,7 +940,7 @@ skip_inode: * snap, we can avoid queueing cap_snaps. 
*/ ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY); + op == CEPH_SNAP_OP_DESTROY, NULL); if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e1aa32d0759d..72bc05a73b69 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -693,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); extern int ceph_update_snap_trace(struct ceph_mds_client *m, - void *p, void *e, bool deletion); + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret); extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); -- cgit v1.2.3 From d3383a8e37f802818cde4cb489bb0735db637cf0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 8 Jan 2015 21:30:12 +0800 Subject: ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_sync) check_cap_flush() calls mutex_lock(), which may block. 
So we can't use it as condition check function for wait_event(); Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 2 +- fs/ceph/mds_client.c | 51 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8ed1192606d9..844b57cb52bd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1450,8 +1450,8 @@ static int __mark_caps_flushing(struct inode *inode, spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; if (list_empty(&ci->i_flushing_item)) { + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; dout(" inode %p now flushing seq %lld\n", inode, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 85c67ae03e46..fdf5cc8737ee 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1464,19 +1464,33 @@ out_unlocked: return err; } +static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + spin_lock(&ci->i_ceph_lock); + if (ci->i_flushing_caps) + ret = ci->i_cap_flush_seq >= want_flush_seq; + else + ret = 1; + spin_unlock(&ci->i_ceph_lock); + return ret; +} + /* * flush all dirty inode data to disk. 
* * returns true if we've flushed through want_flush_seq */ -static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) { - int mds, ret = 1; + int mds; dout("check_cap_flush want %lld\n", want_flush_seq); mutex_lock(&mdsc->mutex); - for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + for (mds = 0; mds < mdsc->max_sessions; mds++) { struct ceph_mds_session *session = mdsc->sessions[mds]; + struct inode *inode = NULL; if (!session) continue; @@ -1489,29 +1503,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) list_entry(session->s_cap_flushing.next, struct ceph_inode_info, i_flushing_item); - struct inode *inode = &ci->vfs_inode; - spin_lock(&ci->i_ceph_lock); - if (ci->i_cap_flush_seq <= want_flush_seq) { + if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", inode, - ci->i_cap_flush_seq, want_flush_seq, - session->s_mds); - ret = 0; + "seq %lld <= %lld to mds%d\n", + &ci->vfs_inode, ci->i_cap_flush_seq, + want_flush_seq, session->s_mds); + inode = igrab(&ci->vfs_inode); } - spin_unlock(&ci->i_ceph_lock); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); - if (!ret) - return ret; + if (inode) { + wait_event(mdsc->cap_flushing_wq, + check_cap_flush(inode, want_flush_seq)); + iput(inode); + } + mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); - return ret; } /* @@ -3447,14 +3461,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; - want_flush = mdsc->cap_flush_seq; mutex_unlock(&mdsc->mutex); - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); ceph_flush_dirty_caps(mdsc); + spin_lock(&mdsc->cap_dirty_lock); + want_flush = mdsc->cap_flush_seq; + spin_unlock(&mdsc->cap_dirty_lock); + + 
dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); wait_unsafe_requests(mdsc, want_tid); - wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); + wait_caps_flush(mdsc, want_flush); } /* -- cgit v1.2.3 From c4d4a582c538e890f09c338bc3063c28dfdc9ae5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 9 Jan 2015 15:56:18 +0800 Subject: ceph: avoid block operation when !TASK_RUNNING (ceph_get_caps) we should not do block operation in wait_event_interruptible()'s condition check function, but reading inline data can block. so move the read inline data code to ceph_get_caps() Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 86 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 844b57cb52bd..8172775428a0 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2072,17 +2072,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. 
*/ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, int *got, struct page **pinned_page, - int *check_max, int *err) + loff_t endoff, int *got, int *check_max, int *err) { struct inode *inode = &ci->vfs_inode; int ret = 0; - int have, implemented, _got = 0; + int have, implemented; int file_wanted; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); -again: + spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2137,8 +2136,8 @@ again: inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { - _got = need | (have & want); - __take_cap_refs(ci, _got); + *got = need | (have & want); + __take_cap_refs(ci, *got); ret = 1; } } else { @@ -2163,39 +2162,8 @@ again: out_unlock: spin_unlock(&ci->i_ceph_lock); - if (ci->i_inline_version != CEPH_INLINE_NONE && - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(inode) > 0) { - int ret1; - struct page *page = find_get_page(inode->i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - goto out; - } - page_cache_release(page); - } - /* - * drop cap refs first because getattr while holding - * caps refs can cause deadlock. 
- */ - ceph_put_cap_refs(ci, _got); - _got = 0; - - /* getattr request will bring inline data into page cache */ - ret1 = __ceph_do_getattr(inode, NULL, - CEPH_STAT_CAP_INLINE_DATA, true); - if (ret1 >= 0) { - ret = 0; - goto again; - } - *err = ret1; - ret = 1; - } -out: dout("get_cap_refs %p ret %d got %s\n", inode, - ret, ceph_cap_string(_got)); - *got = _got; + ret, ceph_cap_string(*got)); return ret; } @@ -2235,22 +2203,52 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int check_max, ret, err; + int _got, check_max, ret, err = 0; retry: if (endoff > 0) check_max_size(&ci->vfs_inode, endoff); + _got = 0; check_max = 0; - err = 0; ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - got, pinned_page, - &check_max, &err)); + try_get_cap_refs(ci, need, want, endoff, + &_got, &check_max, &err)); if (err) ret = err; + if (ret < 0) + return ret; + if (check_max) goto retry; - return ret; + + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + goto out; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while holding + * caps refs can cause deadlock. 
+ */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* getattr request will bring inline data into page cache */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret < 0) + return ret; + goto retry; + } +out: + *got = _got; + return 0; } /* -- cgit v1.2.3 From 86d8f67b26a8b30228b5177b7e594bbc89798a23 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 9 Jan 2015 17:00:42 +0800 Subject: ceph: avoid block operation when !TASK_RUNNING (ceph_mdsc_close_sessions) use an atomic variable to track number of sessions, this can avoid block operation inside wait loops. Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 13 ++++--------- fs/ceph/mds_client.h | 1 + 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fdf5cc8737ee..c90ca99331be 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -480,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->max_sessions = newmax; } mdsc->sessions[mds] = s; + atomic_inc(&mdsc->num_sessions); atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, @@ -503,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc, mdsc->sessions[s->s_mds] = NULL; ceph_con_close(&s->s_con); ceph_put_mds_session(s); + atomic_dec(&mdsc->num_sessions); } /* @@ -3328,6 +3330,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->sessions = NULL; + atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; init_rwsem(&mdsc->snap_rwsem); @@ -3479,17 +3482,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) */ static bool done_closing_sessions(struct ceph_mds_client *mdsc) { - int i, n = 0; - if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return true; - - mutex_lock(&mdsc->mutex); - for (i = 0; 
i < mdsc->max_sessions; i++) - if (mdsc->sessions[i]) - n++; - mutex_unlock(&mdsc->mutex); - return n == 0; + return atomic_read(&mdsc->num_sessions) == 0; } /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index a87b92f500bb..1875b5d985c6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -273,6 +273,7 @@ struct ceph_mds_client { struct list_head waiting_for_map; struct ceph_mds_session **sessions; /* NULL for mds if no session */ + atomic_t num_sessions; int max_sessions; /* len of s_mds_sessions */ int stopping; /* true if shutting down */ -- cgit v1.2.3 From fcc02d2a03fc629b82d1ca1006fbd06570385264 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 10 Jan 2015 11:43:12 +0800 Subject: ceph: fix reading inline data when i_size > PAGE_SIZE when inode has inline data but its size > PAGE_SIZE (it was truncated to larger size), previous direct read code return -EIO. This patch adds code to return zeros for data whose offset > PAGE_SIZE. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 19 ++++++++++++------- fs/ceph/file.c | 22 ++++++++++++++-------- 2 files changed, 26 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c81c0e004588..7d05e37874d4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -196,17 +196,22 @@ static int readpage_nounlock(struct file *filp, struct page *page) u64 len = PAGE_CACHE_SIZE; if (off >= i_size_read(inode)) { - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); return 0; } - /* - * Uptodate inline data should have been added into page cache - * while getting Fcr caps. - */ - if (ci->i_inline_version != CEPH_INLINE_NONE) - return -EINVAL; + if (ci->i_inline_version != CEPH_INLINE_NONE) { + /* + * Uptodate inline data should have been added + * into page cache while getting Fcr caps. 
+ */ + if (off == 0) + return -EINVAL; + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } err = ceph_readpage_from_fscache(inode, page); if (err == 0) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 663da44c06b6..c407abb52b7b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -879,28 +879,34 @@ again: i_size = i_size_read(inode); if (retry_op == READ_INLINE) { - /* does not support inline data > PAGE_SIZE */ - if (i_size > PAGE_CACHE_SIZE) { - ret = -EIO; - } else if (iocb->ki_pos < i_size) { + BUG_ON(ret > 0 || read > 0); + if (iocb->ki_pos < i_size && + iocb->ki_pos < PAGE_CACHE_SIZE) { loff_t end = min_t(loff_t, i_size, iocb->ki_pos + len); + end = min_t(loff_t, end, PAGE_CACHE_SIZE); if (statret < end) zero_user_segment(page, statret, end); ret = copy_page_to_iter(page, iocb->ki_pos & ~PAGE_MASK, end - iocb->ki_pos, to); iocb->ki_pos += ret; - } else { - ret = 0; + read += ret; + } + if (iocb->ki_pos < i_size && read < len) { + size_t zlen = min_t(size_t, len - read, + i_size - iocb->ki_pos); + ret = iov_iter_zero(zlen, to); + iocb->ki_pos += ret; + read += ret; } __free_pages(page, 0); - return ret; + return read; } /* hit EOF or hole? */ if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && - ret < len) { + ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" ", reading more\n", iocb->ki_pos, inode->i_size); -- cgit v1.2.3 From 1f041a89b4f22cf2e701514f4b8f73a8b1e06a3e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 13 Jan 2015 15:20:52 +0800 Subject: ceph: fix request time stamp encoding struct timespec uses 'long' to present second and nanosecond. 'long' is 64 bits on 64bits machine. ceph MDS expects time stamp to be encoded as struct ceph_timespec, which uses 'u32' to present second and nanosecond. 
Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c90ca99331be..03720fe3f531 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1939,7 +1939,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->num_releases = cpu_to_le16(releases); /* time stamp */ - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; @@ -2028,7 +2032,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* time stamp */ p = msg->front.iov_base + req->r_request_release_offset; - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); -- cgit v1.2.3 From 38c48b5f0a7fd5ed9fdab6da4d208aa23cc5391a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 14 Jan 2015 13:46:04 +0800 Subject: ceph: provide separate {inode,file}_operations for snapdir remove all unsupported operations from {inode,file}_operations. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 17 +++++++++++++++-- fs/ceph/inode.c | 4 ++-- fs/ceph/super.h | 2 ++ 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c241603764fd..709f3b98ca5c 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -26,8 +26,6 @@ * point by name.
*/ -const struct inode_operations ceph_dir_iops; -const struct file_operations ceph_dir_fops; const struct dentry_operations ceph_dentry_ops; /* @@ -1335,6 +1333,13 @@ const struct file_operations ceph_dir_fops = { .fsync = ceph_dir_fsync, }; +const struct file_operations ceph_snapdir_fops = { + .iterate = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, +}; + const struct inode_operations ceph_dir_iops = { .lookup = ceph_lookup, .permission = ceph_permission, @@ -1357,6 +1362,14 @@ const struct inode_operations ceph_dir_iops = { .atomic_open = ceph_atomic_open, }; +const struct inode_operations ceph_snapdir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .mkdir = ceph_mkdir, + .rmdir = ceph_unlink, +}; + const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f61a74115beb..d0fe2f437fbb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -82,8 +82,8 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; - inode->i_op = &ceph_dir_iops; - inode->i_fop = &ceph_dir_fops; + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; return inode; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 72bc05a73b69..04c8124ed30e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -893,7 +893,9 @@ extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; +extern const struct file_operations ceph_snapdir_fops; extern const struct inode_operations ceph_dir_iops; +extern const struct inode_operations ceph_snapdir_iops; extern const struct 
dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; -- cgit v1.2.3 From a6a5ce4f0df9146ba8cb61121b80aa191fbb1f04 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 16 Jan 2015 10:54:43 +0800 Subject: client: include kernel version in client metadata Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 03720fe3f531..03482c0974b6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -844,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 struct ceph_options *opt = mdsc->fsc->client->options; void *p; - const char* metadata[3][2] = { + const char* metadata[][2] = { {"hostname", utsname()->nodename}, + {"kernel_version", utsname()->release}, {"entity_id", opt->name ? opt->name : ""}, {NULL, NULL} }; -- cgit v1.2.3 From 2f92b3d0a9a583a5a4dd786a84fc42e6f1aa40fa Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Jan 2015 13:12:24 +0800 Subject: ceph: properly mark empty directory as complete ceph_add_cap() calls __check_cap_issue(), which clears the directory inode's complete flag, so the complete flag for an empty directory should be set after calling ceph_add_cap(). Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d0fe2f437fbb..f88a0f059dc8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -840,30 +840,31 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ceph_vinop(inode), inode->i_mode); } - /* set dir completion flag?
*/ - if (S_ISDIR(inode->i_mode) && - ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - !__ceph_dir_is_complete(ci)) { - dout(" marking %p complete (empty)\n", inode); - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count), - ci->i_ordered_count); - } - /* were we issued a capability? */ if (info->cap.caps) { if (ceph_snap(inode) == CEPH_NOSNAP) { + unsigned caps = le32_to_cpu(info->cap.caps); ceph_add_cap(inode, session, le64_to_cpu(info->cap.cap_id), - cap_fmode, - le32_to_cpu(info->cap.caps), + cap_fmode, caps, le32_to_cpu(info->cap.wanted), le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), le64_to_cpu(info->cap.realm), info->cap.flags, &new_cap); + + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + (caps & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, + atomic_read(&ci->i_release_count), + ci->i_ordered_count); + } + wake = true; } else { dout(" %p got snap_caps %s\n", inode, -- cgit v1.2.3 From bf91c3150880ed6304f578cf00bd408d642fe6a0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 19 Jan 2015 13:23:20 +0800 Subject: ceph: fix atomic_open snapdir ceph_handle_snapdir() checks ceph_mdsc_do_request()'s return value and creates snapdir inode if it's -ENOENT Signed-off-by: Yan, Zheng --- fs/ceph/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c407abb52b7b..848969ee24db 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -275,10 +275,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, req); + err = ceph_handle_snapdir(req, dentry, err); if (err) goto out_req; - err = ceph_handle_snapdir(req, dentry, err); if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); -- cgit v1.2.3 From 2a0b61cefcd52ad63ff03aacae6d4113cdf46812 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 2 Feb 2015 17:54:17 +0300 Subject: ceph: show nocephx_require_signatures and notcp_nodelay options Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 50f06cddc94b..8f8983f38b82 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -425,6 +425,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noshare"); if (opt->flags & CEPH_OPT_NOCRC) seq_puts(m, ",nocrc"); + if (opt->flags & CEPH_OPT_NOMSGAUTH) + seq_puts(m, ",nocephx_require_signatures"); + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) + seq_puts(m, ",notcp_nodelay"); if (opt->name) seq_printf(m, ",name=%s", opt->name); -- cgit v1.2.3 From 3de22be6771353241eaec237fe594dfea3daf30f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 Feb 2015 14:26:22 +0800 Subject: ceph: re-send requests when MDS enters reconnecting stage So that MDS can check if any request is already completed and process completed requests in clientreplay stage. When completed requests are processed in clientreplay stage, MDS can avoid sending traceless replies. 
Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 03482c0974b6..4c1e36a171af 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2184,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) p = rb_next(p); if (req->r_got_unsafe) continue; + if (req->r_attempts > 0) + continue; /* only new requests */ if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid %llu\n", req->r_tid); @@ -2517,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); BUG_ON(req->r_got_result); + req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); @@ -2648,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_request *req, *nreq; + struct rb_node *p; int err; dout("replay_unsafe_requests mds%d\n", session->s_mds); @@ -2660,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, ceph_con_send(&session->s_con, req->r_request); } } + + /* + * also re-send old requests when MDS enters reconnect stage. So that MDS + * can process completed request in clientreplay stage. 
+ */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_got_unsafe) + continue; + if (req->r_attempts == 0) + continue; /* only old requests */ + if (req->r_session && + req->r_session->s_mds == session->s_mds) { + err = __prepare_send_request(mdsc, req, session->s_mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + } mutex_unlock(&mdsc->mutex); } @@ -2977,9 +3003,6 @@ static void check_new_map(struct ceph_mds_client *mdsc, mutex_unlock(&s->s_mutex); s->s_state = CEPH_MDS_SESSION_RESTARTING; } - - /* kick any requests waiting on the recovering mds */ - kick_requests(mdsc, i); } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } -- cgit v1.2.3 From 5cba372c0fe78d24e83d9e0556ecbeb219625c15 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 2 Feb 2015 11:27:56 +0800 Subject: ceph: fix dentry leaks Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 1 + fs/ceph/inode.c | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 709f3b98ca5c..77eeb768f95a 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -676,6 +676,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) */ BUG_ON(!result->d_inode); d_instantiate(dentry, result->d_inode); + d_drop(result); return 0; } return PTR_ERR(result); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f88a0f059dc8..be3af18e4cf1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1449,12 +1449,14 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL); - if (IS_ERR(dn)) { - err = PTR_ERR(dn); + struct dentry *realdn = splice_dentry(dn, in, NULL); + if (IS_ERR(realdn)) { + err = PTR_ERR(realdn); + d_drop(dn); dn = NULL; goto next_item; } + dn = realdn; } di = dn->d_fsdata; -- cgit v1.2.3 From 4d41cef279f72f3965140fffa6b48f2a7d51408c Mon Sep 17 00:00:00 
2001 From: "Yan, Zheng" Date: Wed, 4 Feb 2015 15:10:48 +0800 Subject: ceph: return error for traceless reply race When we receive a traceless reply for a request that created a new inode, we re-send a lookup request to the MDS to get information about the newly created inode. (VFS expects the FS callback to return an inode in the create case.) This breaks one request into two requests. Other client may modify or move to the new inode in the middle. When the race happens, ceph_handle_notrace_create() unconditionally links the dentry for 'create' operation to the inode returned by lookup. This may confuse VFS when the inode is a directory (VFS does not allow multiple linkages for directory inode). This patch makes ceph_handle_notrace_create() return an error when it detects a race. This event should be rare and it happens only when we talk to an old MDS. Recent MDS does not send a traceless reply for a request that creates a new inode. Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 77eeb768f95a..0411dbb15815 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -670,14 +670,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) /* * We created the item, then did a lookup, and found * it was already linked to another inode we already - * had in our cache (and thus got spliced). Link our - * dentry to that inode, but don't hash it, just in - * case the VFS wants to dereference it. + * had in our cache (and thus got spliced). To not + * confuse VFS (especially when inode is a directory), + * we don't link our dentry to that inode, return an + * error instead. + * + * This event should be rare and it happens only when + * we talk to old MDS. Recent MDS does not send traceless + * reply for request that creates new inode.
*/ - BUG_ON(!result->d_inode); - d_instantiate(dentry, result->d_inode); d_drop(result); - return 0; + return -ESTALE; } return PTR_ERR(result); } -- cgit v1.2.3