From 82dcabad750a36a2b749889bc89c5a3188775b2e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 19 Jan 2016 16:19:06 +0100 Subject: libceph: revamp subs code, switch to SUBSCRIBE2 protocol It is currently hard-coded in the mon_client that mdsmap and monmap subs are continuous, while osdmap sub is always "onetime". To better handle full clusters/pools in the osd_client, we need to be able to issue continuous osdmap subs. Revamp subs code to allow us to specify for each sub whether it should be continuous or not. Although not strictly required for the above, switch to SUBSCRIBE2 protocol while at it, eliminating the ambiguity between a request for "every map since X" and a request for "just the latest" when we don't have a map yet (i.e. have epoch 0). SUBSCRIBE2 feature bit is now required - it's been supported since pre-argonaut (2010). Move "got mdsmap" call to the end of ceph_mdsc_handle_map() - calling in before we validate the epoch and successfully install the new map can mess up mon_client sub state. Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_features.h | 2 ++ include/linux/ceph/ceph_fs.h | 4 ++-- include/linux/ceph/mon_client.h | 28 ++++++++++++++++++++-------- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 15151f3c4120..ae2f66833762 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -105,6 +105,7 @@ static inline u64 ceph_sanitize_features(u64 features) */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ @@ -127,6 +128,7 @@ static inline u64 ceph_sanitize_features(u64 features) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d7d072a25c27..bf74005eedec 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -198,8 +198,8 @@ struct ceph_client_mount { #define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ struct ceph_mon_subscribe_item { - __le64 have_version; __le64 have; - __u8 onetime; + __le64 start; + __u8 flags; } __attribute__ ((packed)); struct ceph_mon_subscribe_ack { diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 81810dc21f06..8b2d2f0b659e 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -68,7 +68,8 @@ struct ceph_mon_client { bool hunting; int cur_mon; /* last monitor i contacted */ - unsigned long sub_sent, sub_renew_after; + unsigned long sub_renew_after; + unsigned long sub_renew_sent; struct ceph_connection con; /* pending generic requests */ @@ -76,10 +77,12 @@ struct ceph_mon_client { int num_generic_requests; u64 last_tid; - /* mds/osd map */ - int want_mdsmap; - int want_next_osdmap; /* 1 = want, 2 = want+asked */ - u32 have_osdmap, have_mdsmap; + /* subs, indexed with CEPH_SUB_* */ + struct { + struct ceph_mon_subscribe_item item; + bool want; + u32 have; /* epoch */ + } subs[3]; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; @@ -93,14 +96,23 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); +enum { + CEPH_SUB_MDSMAP = 0, + CEPH_SUB_MONMAP, + CEPH_SUB_OSDMAP, +}; + +extern const char *ceph_sub_str[]; + /* * The model here is to indicate that we need a new map of at least - * epoch @want, and also call in when we receive a map. We will + * epoch @epoch, and also call in when we receive a map. We will * periodically rerequest the map from the monitor cluster until we * get what we want. */ -extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); -extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); +bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, + bool continuous); +void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, -- cgit v1.2.3 From 58d81b1294f02262a141687cd62529c1ec8e6484 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Jan 2016 16:33:15 +0100 Subject: libceph: monc ping rate is 10s Split ping interval and ping timeout: ping interval is 10s; keepalive timeout is 30s. Make monc_ping_timeout a constant while at it - it's not actually exported as a mount option (and the rest of tick-related settings won't be either), so it's got no place in ceph_options. Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 3e3799cdc6e6..f5466273b9a3 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -47,7 +47,6 @@ struct ceph_options { unsigned long mount_timeout; /* jiffies */ unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */ - unsigned long monc_ping_timeout; /* jiffies */ /* * any type that can't be simply compared or doesn't need need @@ -68,7 +67,9 @@ struct ceph_options { #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) -#define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000) + +#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) +#define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) -- cgit v1.2.3 From 168b9090c739c4b5556023a3f08789b349ca7339 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Jan 2016 16:33:19 +0100 Subject: libceph: monc hunt rate is 3s with backoff up to 30s Unless we are in the process of setting up a client (i.e. connecting to the monitor cluster for the first time), apply a backoff: every time we want to reopen a session, increase our timeout by a multiple (currently 2); when we complete the connection, reduce that multipler by 50%. Mirrors ceph.git commit 794c86fd289bd62a35ed14368fa096c46736e9a2. Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 +++ include/linux/ceph/mon_client.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index f5466273b9a3..e7975e4681e1 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -68,8 +68,11 @@ struct ceph_options { #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) +#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) #define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) +#define CEPH_MONC_HUNT_BACKOFF 2 +#define CEPH_MONC_HUNT_MAX_MULT 10 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 8b2d2f0b659e..e230e7ed60d3 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -72,6 +72,9 @@ struct ceph_mon_client { unsigned long sub_renew_sent; struct ceph_connection con; + bool had_a_connection; + int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */ + /* pending generic requests */ struct rb_root generic_request_tree; int num_generic_requests; -- cgit v1.2.3 From de2aa102ea464a54dba14b9588e0bc188bd94707 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 8 Feb 2016 13:39:46 +0100 Subject: libceph: rename ceph_osd_req_op::payload_len to indata_len Follow userspace nomenclature on this - the next commit adds outdata_len. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 7506b485bb6d..35c8b006916f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -77,7 +77,7 @@ struct ceph_osd_data { struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ u32 flags; /* CEPH_OSD_OP_FLAG_* */ - u32 payload_len; + u32 indata_len; /* request */ union { struct ceph_osd_data raw_data_in; struct { -- cgit v1.2.3 From 7665d85b7307fa0218881bc2009de067c42dc52e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 7 Jan 2016 16:48:57 +0800 Subject: libceph: move r_reply_op_{len,result} into struct ceph_osd_req_op This avoids defining large array of r_reply_op_{len,result} in in struct ceph_osd_request. Signed-off-by: Yan, Zheng Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 35c8b006916f..c6d1d603bacf 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -78,6 +78,9 @@ struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ u32 flags; /* CEPH_OSD_OP_FLAG_* */ u32 indata_len; /* request */ + u32 outdata_len; /* reply */ + s32 rval; + union { struct ceph_osd_data raw_data_in; struct { @@ -148,8 +151,6 @@ struct ceph_osd_request { struct ceph_eversion *r_request_reassert_version; int r_result; - int r_reply_op_len[CEPH_OSD_MAX_OP]; - s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; -- cgit v1.2.3 From 3f1af42ad0fad8a12242233dd0d9fc42f5e83415 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 9 Feb 2016 17:50:15 +0100 Subject: libceph: enable large, variable-sized OSD requests Turn r_ops into a flexible array member to enable large, consisting of up to 16 ops, OSD requests. The use case is scattered writeback in cephfs and, as far as the kernel client is concerned, 16 is just a made up number. r_ops had size 3 for copyup+hint+write, but copyup is really a special case - it can only happen once. ceph_osd_request_cache is therefore stuffed with num_ops=2 requests, anything bigger than that is allocated with kmalloc(). req_mempool is backed by ceph_osd_request_cache, which means either num_ops=1 or num_ops=2 for use_mempool=true - all existing users (ceph_writepages_start(), ceph_osdc_writepages()) are fine with that. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c6d1d603bacf..aada6a1383a4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -43,7 +43,8 @@ struct ceph_osd { }; -#define CEPH_OSD_MAX_OP 3 +#define CEPH_OSD_SLAB_OPS 2 +#define CEPH_OSD_MAX_OPS 16 enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE = 0, @@ -139,7 +140,6 @@ struct ceph_osd_request { /* request osd ops array */ unsigned int r_num_ops; - struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; /* these are updated on each send */ __le32 *r_request_osdmap_epoch; @@ -175,6 +175,8 @@ struct ceph_osd_request { unsigned long r_stamp; /* send OR check time */ struct ceph_snap_context *r_snapc; /* snap context for writes */ + + struct ceph_osd_req_op r_ops[]; }; struct ceph_request_redirect { -- cgit v1.2.3 From 2c63f49a724a10bb71cc0fd34f8e5acce78525d5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 7 Jan 2016 17:32:54 +0800 Subject: libceph: add helper that duplicates last extent operation This helper duplicates last extent operation in OSD request, then adjusts the new extent operation's offset and length. The helper is for scatterd page writeback, which adds nonconsecutive dirty pages to single OSD request. Signed-off-by: Yan, Zheng Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index aada6a1383a4..4343df806710 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -266,6 +266,8 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, unsigned int which, u64 length); +extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, + unsigned int which, u64 offset_inc); extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, -- cgit v1.2.3 From 315f24088048a51eed341c53be66ea477a3c7d16 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 7 Mar 2016 10:34:50 +0800 Subject: ceph: fix security xattr deadlock When security is enabled, security module can call filesystem's getxattr/setxattr callbacks during d_instantiate(). For cephfs, d_instantiate() is usually called by MDS' dispatch thread, while handling MDS reply. If the MDS reply does not include xattrs and corresponding caps, getxattr/setxattr need to send a new request to MDS and waits for the reply. This makes MDS' dispatch sleep, nobody handles later MDS replies. The fix is make sure lookup/atomic_open reply include xattrs and corresponding caps. So getxattr can be handled by cached xattrs. This requires some modification to both MDS and request message. (Client tells MDS what caps it wants; MDS encodes proper caps in the reply) Smack security module may call setxattr during d_instantiate(). Unlike getxattr, we can't force MDS to issue CEPH_CAP_XATTR_EXCL to us. So just make setxattr return error when called by MDS' dispatch thread. Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index bf74005eedec..37f28bf55ce4 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -376,7 +376,8 @@ union ceph_mds_request_args { __le32 stripe_count; /* ... */ __le32 object_size; __le32 file_replication; - __le32 unused; /* used to be preferred osd */ + __le32 mask; /* CEPH_CAP_* */ + __le32 old_size; } __attribute__ ((packed)) open; struct { __le32 flags; -- cgit v1.2.3