From 13d1ad16d05eebb4db977eb955716b9da2c19fbd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 27 Apr 2016 14:15:51 +0200 Subject: libceph: move message allocation out of ceph_osdc_alloc_request() The size of ->r_request and ->r_reply messages depends on the size of the object name (ceph_object_id), while the size of ceph_osd_request is fixed. Move message allocation into a separate function that would have to be called after ceph_object_id and ceph_object_locator (which is also going to become variable in size with RADOS namespaces) have been filled in: req = ceph_osdc_alloc_request(...); r_base_oid> r_base_oloc> ceph_osdc_alloc_messages(req); Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cbf460927c42..66a1fcd5bff7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -322,6 +322,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, struct ceph_snap_context *snapc, -- cgit v1.2.3 From d30291b985d1854565d7f2c82a4457869d5265e8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 29 Apr 2016 19:54:20 +0200 Subject: libceph: variable-sized ceph_object_id Currently ceph_object_id can hold object names of up to 100 (CEPH_MAX_OID_NAME_LEN) characters. This is enough for all use cases, expect one - long rbd image names: - a format 1 header is named ".rbd" - an object that points to a format 2 header is named "rbd_id." We operate on these potentially long-named objects during rbd map, and, for format 1 images, during header refresh. (A format 2 header name is a small system-generated string.) Lift this 100 character limit by making ceph_object_id be able to point to an externally-allocated string. Apart from being able to work with almost arbitrarily-long named objects, this allows us to reduce the size of ceph_object_id from >100 bytes to 64 bytes. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 62 +++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e55c08bc3a96..777a29412706 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -64,11 +64,47 @@ struct ceph_object_locator { */ #define CEPH_MAX_OID_NAME_LEN 100 +/* + * 51-char inline_name is long enough for all cephfs and all but one + * rbd requests: in ".rbd"/"rbd_id." can be + * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all + * other rbd requests fit into inline_name. + * + * Makes ceph_object_id 64 bytes on 64-bit. + */ +#define CEPH_OID_INLINE_LEN 52 + +/* + * Both inline and external buffers have space for a NUL-terminator, + * which is carried around. It's not required though - RADOS object + * names don't have to be NUL-terminated and may contain NULs. + */ struct ceph_object_id { - char name[CEPH_MAX_OID_NAME_LEN]; + char *name; + char inline_name[CEPH_OID_INLINE_LEN]; int name_len; }; +static inline void ceph_oid_init(struct ceph_object_id *oid) +{ + oid->name = oid->inline_name; + oid->name_len = 0; +} + +static inline bool ceph_oid_empty(const struct ceph_object_id *oid) +{ + return oid->name == oid->inline_name && !oid->name_len; +} + +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src); +__printf(2, 3) +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...); +__printf(3, 4) +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...); +void ceph_oid_destroy(struct ceph_object_id *oid); + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -113,30 +149,6 @@ struct ceph_osdmap { int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; }; -static inline void ceph_oid_set_name(struct ceph_object_id *oid, - const char *name) -{ - int len; - - len = strlen(name); - if (len > sizeof(oid->name)) { - WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", - name, len, sizeof(oid->name)); - len = sizeof(oid->name); - } - - memcpy(oid->name, name, len); - oid->name_len = len; -} - -static inline void ceph_oid_copy(struct ceph_object_id *dest, - struct ceph_object_id *src) -{ - BUG_ON(src->name_len > sizeof(dest->name)); - memcpy(dest->name, src->name, src->name_len); - dest->name_len = src->name_len; -} - static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) { return osd >= 0 && osd < map->max_osd && -- cgit v1.2.3 From 0c0a8de13f9612a663b050afa955e6668858d1eb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:21 +0200 Subject: libceph: nuke unused fields and functions Either unused or useless: osdmap->mkfs_epoch osd->o_marked_for_keepalive monc->num_generic_requests osdc->map_waiters osdc->last_requested_map osdc->timeout_tid osd_req_op_cls_response_data() osdmap_apply_incremental() @msgr arg Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 1 - include/linux/ceph/osd_client.h | 8 -------- include/linux/ceph/osdmap.h | 6 ++---- 3 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index e230e7ed60d3..330d045e4092 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -77,7 +77,6 @@ struct ceph_mon_client { /* pending generic requests */ struct rb_root generic_request_tree; - int num_generic_requests; u64 last_tid; /* subs, indexed with CEPH_SUB_* */ diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 66a1fcd5bff7..63854a8df183 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -37,11 +37,9 @@ struct ceph_osd { struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; - int o_marked_for_keepalive; struct list_head o_keepalive_item; }; - #define CEPH_OSD_SLAB_OPS 2 #define CEPH_OSD_MAX_OPS 16 @@ -206,13 +204,10 @@ struct ceph_osd_client { struct ceph_osdmap *osdmap; /* current map */ struct rw_semaphore map_sem; - struct completion map_waiters; - u64 last_requested_map; struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ - u64 timeout_tid; /* tid of timeout triggering rq */ u64 last_tid; /* tid of last request */ struct rb_root requests; /* pending requests */ struct list_head req_lru; /* in-flight lru */ @@ -271,9 +266,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, unsigned int which); -extern struct ceph_osd_data *osd_req_op_cls_response_data( - struct ceph_osd_request *osd_req, - unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, unsigned int which, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 777a29412706..ce7a41a182d4 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -123,7 +123,6 @@ struct ceph_pg_mapping { struct ceph_osdmap { struct ceph_fsid fsid; u32 epoch; - u32 mkfs_epoch; struct ceph_timespec created, modified; u32 flags; /* CEPH_OSDMAP_* */ @@ -205,9 +204,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) } extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); -extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr); +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ -- cgit v1.2.3 From fcd00b68bbe2bf5606cb45c2cd4a250a390bcc1f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: DEFINE_RB_FUNCS macro Given struct foo { u64 id; struct rb_node bar_node; }; generate insert_bar(), erase_bar() and lookup_bar() functions with DEFINE_RB_FUNCS(bar, struct foo, id, bar_node) The key is assumed to be an integer (u64, int, etc), compared with < and >. nodefld has to be initialized with RB_CLEAR_NODE(). Start using it for MDS, MON and OSD requests and OSD sessions. Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 57 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index db92a8d4926e..690985daad1c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_SHIFT); } +/* + * These are not meant to be generic - an integer key is assumed. + */ +#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +static void insert_##name(struct rb_root *root, type *t) \ +{ \ + struct rb_node **n = &root->rb_node; \ + struct rb_node *parent = NULL; \ + \ + BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \ + \ + while (*n) { \ + type *cur = rb_entry(*n, type, nodefld); \ + \ + parent = *n; \ + if (t->keyfld < cur->keyfld) \ + n = &(*n)->rb_left; \ + else if (t->keyfld > cur->keyfld) \ + n = &(*n)->rb_right; \ + else \ + BUG(); \ + } \ + \ + rb_link_node(&t->nodefld, parent, n); \ + rb_insert_color(&t->nodefld, root); \ +} \ +static void erase_##name(struct rb_root *root, type *t) \ +{ \ + BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \ + rb_erase(&t->nodefld, root); \ + RB_CLEAR_NODE(&t->nodefld); \ +} + +#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +static type *lookup_##name(struct rb_root *root, \ + typeof(((type *)0)->keyfld) key) \ +{ \ + struct rb_node *n = root->rb_node; \ + \ + while (n) { \ + type *cur = rb_entry(n, type, nodefld); \ + \ + if (key < cur->keyfld) \ + n = n->rb_left; \ + else if (key > cur->keyfld) \ + n = n->rb_right; \ + else \ + return cur; \ + } \ + \ + return NULL; \ +} + +#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ +DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) + extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; extern struct kmem_cache *ceph_cap_flush_cachep; -- cgit v1.2.3 From 985c1673885b77b2e0167c6478a833817d1e2fe5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: fix ceph_eversion encoding eversion_t is version+epoch in userspace and is encoded in that order. ceph_eversion is defined as epoch+version in rados.h, yet we memcpy it in __send_request(). Reoder ceph_eversion fields. Signed-off-by: Ilya Dryomov --- include/linux/ceph/rados.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 2f822dca1046..913c87c26d33 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -114,8 +114,8 @@ struct ceph_object_layout { * compound epoch+version, used by storage layer to serialize mutations */ struct ceph_eversion { - __le32 epoch; __le64 version; + __le32 epoch; } __attribute__ ((packed)); /* -- cgit v1.2.3 From d9591f5e28686277d9312d3c7422faf1368b305e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: rename ceph_oloc_oid_to_pg() Rename ceph_oloc_oid_to_pg() to ceph_object_locator_to_pg(). Emphasise that returned is raw PG and return -ENOENT instead of -EIO if the pool doesn't exist. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ce7a41a182d4..b70440c05b49 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -213,11 +213,10 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); -/* calculate mapping of object to a placement group */ -extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out); +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid); extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, -- cgit v1.2.3 From 6f3bfd45cd233eea0b07e3cabc0386b5de9321d2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:22 +0200 Subject: libceph: ceph_osds, ceph_pg_to_up_acting_osds() Knowning just acting set isn't enough, we need to be able to record up set as well to detect interval changes. This means returning (up[], up_len, up_primary, acting[], acting_len, acting_primary) and passing it around. Introduce and switch to ceph_osds to help with that. Rename ceph_calc_pg_acting() to ceph_pg_to_up_acting_osds() and return both up and acting sets from it. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index b70440c05b49..942189d311e0 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -208,6 +208,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); +struct ceph_osds { + int osds[CEPH_PG_MAX_SIZE]; + int size; + int primary; /* id, NOT index */ +}; + +static inline void ceph_osds_init(struct ceph_osds *set) +{ + set->size = 0; + set->primary = -1; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, @@ -218,9 +232,10 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, struct ceph_object_locator *oloc, struct ceph_pg *raw_pgid); -extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, - struct ceph_pg pgid, - int *osds, int *primary); +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); -- cgit v1.2.3 From f81f16339a05775df600b2ff75a79be1864975c1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: rename ceph_calc_pg_primary() Rename ceph_calc_pg_primary() to ceph_pg_to_acting_primary() to emphasise that it returns acting primary. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 942189d311e0..3fd978a1639b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -236,8 +236,8 @@ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting); -extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, - struct ceph_pg pgid); +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid); extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); -- cgit v1.2.3 From f984cb76cc5fb9fc76d6abb6c4694a5412e3f49b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: make pgid_cmp() global calc_target() code is going to need to know how to compare PGs. Take lhs and rhs pgid by const * while at it. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 3fd978a1639b..7783237ab06c 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -24,6 +24,8 @@ struct ceph_pg { uint32_t seed; }; +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); + #define CEPH_POOL_FLAG_HASHPSPOOL 1 struct ceph_pg_pool_info { -- cgit v1.2.3 From 04812acf572ef41fd51c11e0bf3385f34c0e1b5b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: pi->min_size, pi->last_force_request_resend Add and decode pi->min_size and pi->last_force_request_resend. These are going to be used by calc_target(). Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 7783237ab06c..989294d0b8d2 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -26,20 +26,23 @@ struct ceph_pg { int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); -#define CEPH_POOL_FLAG_HASHPSPOOL 1 +#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id + together */ struct ceph_pg_pool_info { struct rb_node node; s64 id; - u8 type; + u8 type; /* CEPH_POOL_TYPE_* */ u8 size; + u8 min_size; u8 crush_ruleset; u8 object_hash; + u32 last_force_request_resend; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; s64 read_tier; s64 write_tier; /* wins for read+write ops */ - u64 flags; + u64 flags; /* CEPH_POOL_FLAG_* */ char *name; }; -- cgit v1.2.3 From 63244fa123a755e4bbaee03022b68613c71d1332 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: introduce ceph_osd_request_target, calc_target() Introduce ceph_osd_request_target, containing all mapping-related fields of ceph_osd_request and calc_target() for calculating mappings and populating it. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 23 +++++++++++++++++++++++ include/linux/ceph/osdmap.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/ceph/rados.h | 5 +++++ 3 files changed, 62 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63854a8df183..48806ee4488d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -24,6 +24,8 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); +#define CEPH_HOMELESS_OSD -1 + /* a given osd we're communicating with */ struct ceph_osd { atomic_t o_ref; @@ -118,6 +120,27 @@ struct ceph_osd_req_op { }; }; +struct ceph_osd_request_target { + struct ceph_object_id base_oid; + struct ceph_object_locator base_oloc; + struct ceph_object_id target_oid; + struct ceph_object_locator target_oloc; + + struct ceph_pg pgid; + u32 pg_num; + u32 pg_num_mask; + struct ceph_osds acting; + struct ceph_osds up; + int size; + int min_size; + bool sort_bitwise; + + unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool paused; + + int osd; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 989294d0b8d2..420bb7968b25 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -28,6 +28,7 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ +#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ struct ceph_pg_pool_info { struct rb_node node; @@ -62,6 +63,22 @@ struct ceph_object_locator { s64 pool; }; +static inline void ceph_oloc_init(struct ceph_object_locator *oloc) +{ + oloc->pool = -1; +} + +static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) +{ + return oloc->pool == -1; +} + +static inline void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + dest->pool = src->pool; +} + /* * Maximum supported by kernel client object name length * @@ -227,6 +244,23 @@ static inline void ceph_osds_init(struct ceph_osds *set) void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid); +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change); + /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 len, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 913c87c26d33..f28ed864e682 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ /* * The error code to return when an OSD can't handle a write -- cgit v1.2.3 From a66dd38309f5d9c66ec9bc7911ff8da8cc37bb9f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:23 +0200 Subject: libceph: switch to calc_target(), part 1 Replace __calc_request_pg() and most of __map_request() with calc_target() and start using req->r_t. ceph_osdc_build_request() however still encodes base_oid, because it's called before calc_target() is and target_oid is empty at that point in time; a printf in osdc_show() also shows base_oid. This is fixed in "libceph: switch to calc_target(), part 2". Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 48806ee4488d..03bf9d9e1517 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -150,12 +150,13 @@ struct ceph_osd_request { struct list_head r_linger_item; struct list_head r_linger_osd_item; struct ceph_osd *r_osd; - struct ceph_pg r_pgid; - int r_pg_osds[CEPH_PG_MAX_SIZE]; - int r_num_pg_osds; + + struct ceph_osd_request_target r_t; +#define r_base_oid r_t.base_oid +#define r_base_oloc r_t.base_oloc +#define r_flags r_t.flags struct ceph_msg *r_request, *r_reply; - int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ /* request osd ops array */ @@ -167,7 +168,6 @@ struct ceph_osd_request { __le64 *r_request_pool; void *r_request_pgid; __le32 *r_request_attempts; - bool r_paused; struct ceph_eversion *r_request_reassert_version; int r_result; @@ -186,11 +186,6 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - struct ceph_object_locator r_base_oloc; - struct ceph_object_id r_base_oid; - struct ceph_object_locator r_target_oloc; - struct ceph_object_id r_target_oid; - u64 r_snapid; unsigned long r_stamp; /* send OR check time */ -- cgit v1.2.3 From bb873b539154ab51893430b4ad6ba4051775276a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 00:29:52 +0200 Subject: libceph: switch to calc_target(), part 2 The crux of this is getting rid of ceph_osdc_build_request(), so that MOSDOp can be encoded not before but after calc_target() calculates the actual target. Encoding now happens within ceph_osdc_start_request(). Also nuked is the accompanying bunch of pointers into the encoded buffer that was used to update fields on each send - instead, the entire front is re-encoded. If we want to support target->name_len != base->name_len in the future, there is no other way, because oid is surrounded by other fields in the encoded buffer. Encoding OSD ops and adding data items to the request message were mixed together in osd_req_encode_op(). While we want to re-encode OSD ops, we don't want to add duplicate data items to the message when resending, so all call to ceph_osdc_msg_data_add() are factored out into a new setup_request_data(). Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 29 +++++++++++------------------ include/linux/ceph/rados.h | 7 +++++++ 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 03bf9d9e1517..67a37d98e0ca 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -104,7 +104,7 @@ struct ceph_osd_req_op { struct ceph_osd_data response_data; __u8 class_len; __u8 method_len; - __u8 argc; + u32 indata_len; } cls; struct { u64 cookie; @@ -162,14 +162,6 @@ struct ceph_osd_request { /* request osd ops array */ unsigned int r_num_ops; - /* these are updated on each send */ - __le32 *r_request_osdmap_epoch; - __le32 *r_request_flags; - __le64 *r_request_pool; - void *r_request_pgid; - __le32 *r_request_attempts; - struct ceph_eversion *r_request_reassert_version; - int r_result; int r_got_reply; int r_linger; @@ -180,16 +172,22 @@ struct ceph_osd_request { struct completion r_completion, r_safe_completion; ceph_osdc_callback_t r_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback; - struct ceph_eversion r_reassert_version; struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - u64 r_snapid; - unsigned long r_stamp; /* send OR check time */ + /* set by submitter */ + u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ + struct ceph_snap_context *r_snapc; /* for writes */ + struct timespec r_mtime; /* ditto */ + u64 r_data_offset; /* ditto */ - struct ceph_snap_context *r_snapc; /* snap context for writes */ + /* internal */ + unsigned long r_stamp; /* jiffies, send or check time */ + int r_attempts; + struct ceph_eversion r_replay_version; /* aka reassert_version */ + u32 r_last_force_resend; struct ceph_osd_req_op r_ops[]; }; @@ -334,11 +332,6 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * gfp_t gfp_flags); int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); -extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, - u64 snap_id, - struct timespec *mtime); - extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index f28ed864e682..28740a58f32c 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -394,6 +394,13 @@ enum { CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */ + CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ }; enum { -- cgit v1.2.3 From 85e084feb47349d62989efe1713a8723af95f4ea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: libceph: drop msg argument from ceph_osdc_callback_t finish_read(), its only user, uses it to get to hdr.data_len, which is what ->r_result is set to on success. This gains us the ability to safely call callbacks from contexts other than reply, e.g. map check. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 67a37d98e0ca..3bebd60e7f9f 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -20,8 +20,7 @@ struct ceph_osd_client; /* * completion callback for async writepages */ -typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, - struct ceph_msg *); +typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); #define CEPH_HOMELESS_OSD -1 -- cgit v1.2.3 From fe5da05e979830b43b115d8a18ead521d507c783 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:24 +0200 Subject: libceph: redo callbacks and factor out MOSDOpReply decoding If you specify ACK | ONDISK and set ->r_unsafe_callback, both ->r_callback and ->r_unsafe_callback(true) are called on ack. This is very confusing. Redo this so that only one of them is called: ->r_unsafe_callback(true), on ack ->r_unsafe_callback(false), on commit or ->r_callback, on ack|commit Decode everything in decode_MOSDOpReply() to reduce clutter. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3bebd60e7f9f..2415dc0cb008 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -162,13 +162,14 @@ struct ceph_osd_request { unsigned int r_num_ops; int r_result; - int r_got_reply; + bool r_got_reply; int r_linger; struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; - struct completion r_completion, r_safe_completion; + struct completion r_completion; + struct completion r_safe_completion; /* fsync waiter */ ceph_osdc_callback_t r_callback; ceph_osdc_unsafe_callback_t r_unsafe_callback; struct list_head r_unsafe_item; -- cgit v1.2.3 From e5253a7bde13788d9dc75f42eb47ea119af5609f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: libceph: allocate dummy osdmap in ceph_osdc_init() This leads to a simpler osdmap handling code, particularly when dealing with pi->was_full, which is introduced in a later commit. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 420bb7968b25..8468c734d712 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -225,6 +225,7 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) return 0; } +struct ceph_osdmap *ceph_osdmap_alloc(void); extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map); -- cgit v1.2.3 From 42c1b1240326cbea86f15f5d4ce565d8b54be31f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: libceph: handle_one_map() Separate osdmap handling from decoding and iterating over a bag of maps in a fresh MOSDMap message. This sets up the scene for the updated OSD client. Of particular importance here is the addition of pi->was_full, which can be used to answer "did this pool go full -> not-full in this map?". This is the key bit for supporting pool quotas. We won't be able to downgrade map_sem for much longer, so drop downgrade_write(). Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 1 + include/linux/ceph/osdmap.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 330d045e4092..c14e9d861cda 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -115,6 +115,7 @@ extern const char *ceph_sub_str[]; bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, bool continuous); void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); +void ceph_monc_renew_subs(struct ceph_mon_client *monc); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 8468c734d712..821e16fff39a 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -45,6 +45,8 @@ struct ceph_pg_pool_info { s64 write_tier; /* wins for read+write ops */ u64 flags; /* CEPH_POOL_FLAG_* */ char *name; + + bool was_full; /* for handle_one_map() */ }; static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) -- cgit v1.2.3 From 9dd2845ccb40452d4ac943231ea34aade4a02c68 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: libceph: protect osdc->osd_lru list with a spinlock OSD client is getting moved from the big per-client lock to a set of per-session locks. The big rwlock would only be held for read most of the time, so a global osdc->osd_lru needs additional protection. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2415dc0cb008..486d681694c4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,6 +224,7 @@ struct ceph_osd_client { struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ + spinlock_t osd_lru_lock; u64 last_tid; /* tid of last request */ struct rb_root requests; /* pending requests */ struct list_head req_lru; /* in-flight lru */ -- cgit v1.2.3 From 5aea3dcd50215fa9563270251ad7323e2f2490ee Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:26 +0200 Subject: libceph: a major OSD client update This is a major sync up, up to ~Jewel. The highlights are: - per-session request trees (vs a global per-client tree) - per-session locking (vs a global per-client rwlock) - homeless OSD session - no ad-hoc global per-client lists - support for pool quotas - foundation for watch/notify v2 support - foundation for map check (pool deletion detection) support The switchover is incomplete: lingering requests can be setup and teared down but aren't ever reestablished. This functionality is restored with the introduction of the new lingering infrastructure (ceph_osd_linger_request, linger_work, etc) in a later commit. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 486d681694c4..342f22f1f040 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -33,12 +33,13 @@ struct ceph_osd { int o_incarnation; struct rb_node o_node; struct ceph_connection o_con; - struct list_head o_requests; + struct rb_root o_requests; struct list_head o_linger_requests; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; struct list_head o_keepalive_item; + struct mutex lock; }; #define CEPH_OSD_SLAB_OPS 2 @@ -144,8 +145,6 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; - struct list_head r_req_lru_item; - struct list_head r_osd_item; struct list_head r_linger_item; struct list_head r_linger_osd_item; struct ceph_osd *r_osd; @@ -219,19 +218,16 @@ struct ceph_osd_client { struct ceph_client *client; struct ceph_osdmap *osdmap; /* current map */ - struct rw_semaphore map_sem; + struct rw_semaphore lock; - struct mutex request_mutex; struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; - u64 last_tid; /* tid of last request */ - struct rb_root requests; /* pending requests */ - struct list_head req_lru; /* in-flight lru */ - struct list_head req_unsent; /* unsent/need-resend queue */ - struct list_head req_notarget; /* map to no osd */ struct list_head req_linger; /* lingering requests */ - int num_requests; + struct ceph_osd homeless_osd; + atomic64_t last_tid; /* tid of last request */ + atomic_t num_requests; + atomic_t num_homeless; struct delayed_work timeout_work; struct delayed_work osds_timeout_work; #ifdef CONFIG_DEBUG_FS -- cgit v1.2.3 From 922dab6134178cae317ae00de86376cba59f3147 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 01:15:02 +0200 Subject: libceph, rbd: ceph_osd_linger_request, watch/notify v2 This adds support and switches rbd to a new, more reliable version of watch/notify protocol. As with the OSD client update, this is mostly about getting the right structures linked into the right places so that reconnects are properly sent when needed. watch/notify v2 also requires sending regular pings to the OSDs - send_linger_ping(). A major change from the old watch/notify implementation is the introduction of ceph_osd_linger_request - linger requests no longer piggy back on ceph_osd_request. ceph_osd_event has been merged into ceph_osd_linger_request. All the details are now hidden within libceph, the interface consists of a simple pair of watch/unwatch functions and ceph_osdc_notify_ack(). ceph_osdc_watch() does return ceph_osd_linger_request, but only to keep the lifetime management simple. ceph_osdc_notify_ack() accepts an optional data payload, which is relayed back to the notifier. Portions of this patch are loosely based on work by Douglas Fuller and Mike Christie . Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 5 ++- include/linux/ceph/osd_client.h | 97 ++++++++++++++++++++++++----------------- include/linux/ceph/rados.h | 17 ++++++-- 3 files changed, 75 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 37f28bf55ce4..3b911ff889dd 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -153,8 +153,9 @@ struct ceph_dir_layout { /* watch-notify operations */ enum { - WATCH_NOTIFY = 1, /* notifying watcher */ - WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ + CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ }; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 342f22f1f040..cd2dcb8939de 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -34,7 +34,7 @@ struct ceph_osd { struct rb_node o_node; struct ceph_connection o_con; struct rb_root o_requests; - struct list_head o_linger_requests; + struct rb_root o_linger_requests; struct list_head o_osd_lru; struct ceph_auth_handshake o_auth; unsigned long lru_ttl; @@ -108,11 +108,12 @@ struct ceph_osd_req_op { } cls; struct { u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; + __u8 op; /* CEPH_OSD_WATCH_OP_ */ + u32 gen; } watch; + struct { + struct ceph_osd_data request_data; + } notify_ack; struct { u64 expected_object_size; u64 expected_write_size; @@ -145,8 +146,6 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; - struct list_head r_linger_item; - struct list_head r_linger_osd_item; struct ceph_osd *r_osd; struct ceph_osd_request_target r_t; @@ -162,7 +161,6 @@ struct ceph_osd_request { int r_result; bool r_got_reply; - int r_linger; struct ceph_osd_client *r_osdc; struct kref r_kref; @@ -181,6 +179,7 @@ struct ceph_osd_request { struct ceph_snap_context *r_snapc; /* for writes */ struct timespec r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ + bool r_linger; /* don't resend on failure */ /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ @@ -195,23 +194,40 @@ struct ceph_request_redirect { struct ceph_object_locator oloc; }; -struct ceph_osd_event { - u64 cookie; - int one_shot; +typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, + u64 notifier_id, void *data, size_t data_len); +typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); + +struct ceph_osd_linger_request { struct ceph_osd_client *osdc; - void (*cb)(u64, u64, u8, void *); - void *data; - struct rb_node node; - struct list_head osd_node; + u64 linger_id; + bool committed; + + struct ceph_osd *osd; + struct ceph_osd_request *reg_req; + struct ceph_osd_request *ping_req; + unsigned long ping_sent; + + struct ceph_osd_request_target t; + u32 last_force_resend; + + struct timespec mtime; + struct kref kref; -}; + struct mutex lock; + struct rb_node node; /* osd */ + struct rb_node osdc_node; /* osdc */ + struct list_head scan_item; + + struct completion reg_commit_wait; + int reg_commit_error; + int last_error; + + u32 register_gen; -struct ceph_osd_event_work { - struct work_struct work; - struct ceph_osd_event *event; - u64 ver; - u64 notify_id; - u8 opcode; + rados_watchcb2_t wcb; + rados_watcherrcb_t errcb; + void *data; }; struct ceph_osd_client { @@ -223,9 +239,10 @@ struct ceph_osd_client { struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; - struct list_head req_linger; /* lingering requests */ struct ceph_osd homeless_osd; atomic64_t last_tid; /* tid of last request */ + u64 last_linger_id; + struct rb_root linger_requests; /* lingering requests */ atomic_t num_requests; atomic_t num_homeless; struct delayed_work timeout_work; @@ -239,10 +256,6 @@ struct ceph_osd_client { struct ceph_msgpool msgpool_op; struct ceph_msgpool msgpool_op_reply; - spinlock_t event_lock; - struct rb_root event_tree; - u64 event_count; - struct workqueue_struct *notify_wq; }; @@ -314,9 +327,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode); -extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - u64 cookie, u64 version, int flag); extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, @@ -339,9 +349,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, u32 truncate_seq, u64 truncate_size, bool use_mempool); -extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); - extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req); @@ -372,11 +379,23 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct timespec *mtime, struct page **pages, int nr_pages); -/* watch/notify events */ -extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, - void (*event_cb)(u64, u64, u8, void *), - void *data, struct ceph_osd_event **pevent); -extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); -extern void ceph_osdc_put_event(struct ceph_osd_event *event); +/* watch/notify */ +struct ceph_osd_linger_request * +ceph_osdc_watch(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + rados_watchcb2_t wcb, + rados_watcherrcb_t errcb, + void *data); +int ceph_osdc_unwatch(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); + +int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + u64 notify_id, + u64 cookie, + void *payload, + size_t payload_len); #endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 28740a58f32c..204c8c944703 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -427,7 +427,17 @@ enum { CEPH_OSD_CMPXATTR_MODE_U64 = 2 }; -#define RADOS_NOTIFY_VER 1 +enum { + CEPH_OSD_WATCH_OP_UNWATCH = 0, + CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, + /* note: use only ODD ids to prevent pre-giant code from + interpreting the op as UNWATCH */ + CEPH_OSD_WATCH_OP_WATCH = 3, + CEPH_OSD_WATCH_OP_RECONNECT = 5, + CEPH_OSD_WATCH_OP_PING = 7, +}; + +const char *ceph_osd_watch_op_name(int o); /* * an individual object operation. each may be accompanied by some data @@ -462,8 +472,9 @@ struct ceph_osd_op { } __attribute__ ((packed)) snap; struct { __le64 cookie; - __le64 ver; - __u8 flag; /* 0 = unwatch, 1 = watch */ + __le64 ver; /* no longer used */ + __u8 op; /* CEPH_OSD_WATCH_OP_* */ + __le32 gen; /* registration generation */ } __attribute__ ((packed)) watch; struct { __le64 offset, length; -- cgit v1.2.3 From 1907920324f1f3ebb6618344417c03a2863bba01 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: libceph: support for sending notifies Implement ceph_osdc_notify() for sending notifies. Due to the fact that the current messenger can't do read-in into pagelists (it can only do write-out from them), I had to go with a page vector for a NOTIFY_COMPLETE payload, for now. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 20 ++++++++++++++++++++ include/linux/ceph/rados.h | 3 +++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cd2dcb8939de..63054fae4f15 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -114,6 +114,11 @@ struct ceph_osd_req_op { struct { struct ceph_osd_data request_data; } notify_ack; + struct { + u64 cookie; + struct ceph_osd_data request_data; + struct ceph_osd_data response_data; + } notify; struct { u64 expected_object_size; u64 expected_write_size; @@ -202,6 +207,7 @@ struct ceph_osd_linger_request { struct ceph_osd_client *osdc; u64 linger_id; bool committed; + bool is_watch; /* watch or notify */ struct ceph_osd *osd; struct ceph_osd_request *reg_req; @@ -220,14 +226,20 @@ struct ceph_osd_linger_request { struct list_head scan_item; struct completion reg_commit_wait; + struct completion notify_finish_wait; int reg_commit_error; + int notify_finish_error; int last_error; u32 register_gen; + u64 notify_id; rados_watchcb2_t wcb; rados_watcherrcb_t errcb; void *data; + + struct page ***preply_pages; + size_t *preply_len; }; struct ceph_osd_client { @@ -397,5 +409,13 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, u64 cookie, void *payload, size_t payload_len); +int ceph_osdc_notify(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *payload, + size_t payload_len, + u32 timeout, + struct page ***preply_pages, + size_t *preply_len); #endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 204c8c944703..5c0da61cb763 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -476,6 +476,9 @@ struct ceph_osd_op { __u8 op; /* CEPH_OSD_WATCH_OP_* */ __le32 gen; /* registration generation */ } __attribute__ ((packed)) watch; + struct { + __le64 cookie; + } __attribute__ ((packed)) notify; struct { __le64 offset, length; __le64 src_offset; -- cgit v1.2.3 From b07d3c4bd7270c74e2b6803af8ac8a00cb3e5ed2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: libceph: support for checking on status of watch Implement ceph_osdc_watch_check() to be able to check on status of watch. Note that the time it takes for a watch/notify event to get delivered through the notify_wq is taken into account. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 63054fae4f15..2ae7cfd82ec9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -213,6 +213,8 @@ struct ceph_osd_linger_request { struct ceph_osd_request *reg_req; struct ceph_osd_request *ping_req; unsigned long ping_sent; + unsigned long watch_valid_thru; + struct list_head pending_lworks; struct ceph_osd_request_target t; u32 last_force_resend; @@ -417,5 +419,7 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, u32 timeout, struct page ***preply_pages, size_t *preply_len); +int ceph_osdc_watch_check(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq); #endif -- cgit v1.2.3 From d0b19705e99939f5ae5aa9b57bfe41dd4777d951 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: libceph: async MON client generic requests For map check, we are going to need to send CEPH_MSG_MON_GET_VERSION messages asynchronously and get a callback on completion. Refactor MON client to allow firing off generic requests asynchronously and add an async variant of ceph_monc_get_version(). ceph_monc_do_statfs() is switched over and remains sync. Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index c14e9d861cda..19800d9b45f3 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -39,20 +39,31 @@ struct ceph_mon_request { ceph_monc_request_func_t do_request; }; +typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *); + /* * ceph_mon_generic_request is being used for the statfs and * mon_get_version requests which are being done a bit differently * because we need to get data back to the caller */ struct ceph_mon_generic_request { + struct ceph_mon_client *monc; struct kref kref; u64 tid; struct rb_node node; int result; - void *buf; + struct completion completion; + ceph_monc_callback_t complete_cb; + u64 private_data; /* r_tid/linger_id */ + struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ + + union { + struct ceph_statfs *st; + u64 newest; + } u; }; struct ceph_mon_client { @@ -124,8 +135,10 @@ extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf); -extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, - const char *what, u64 *newest); +int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest); +int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data); extern int ceph_monc_open_session(struct ceph_mon_client *monc); -- cgit v1.2.3 From 4609245e2670e3698b083bcd9cc69a65b2b6f9a6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:27 +0200 Subject: libceph: pool deletion detection This adds the "map check" infrastructure for sending osdmap version checks on CALC_TARGET_POOL_DNE and completing in-flight requests with -ENOENT if the target pool doesn't exist or has just been deleted. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2ae7cfd82ec9..3e7bf72e4078 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -151,6 +151,7 @@ struct ceph_osd_request_target { struct ceph_osd_request { u64 r_tid; /* unique for this client */ struct rb_node r_node; + struct rb_node r_mc_node; /* map check */ struct ceph_osd *r_osd; struct ceph_osd_request_target r_t; @@ -191,6 +192,7 @@ struct ceph_osd_request { int r_attempts; struct ceph_eversion r_replay_version; /* aka reassert_version */ u32 r_last_force_resend; + u32 r_map_dne_bound; struct ceph_osd_req_op r_ops[]; }; @@ -218,6 +220,7 @@ struct ceph_osd_linger_request { struct ceph_osd_request_target t; u32 last_force_resend; + u32 map_dne_bound; struct timespec mtime; @@ -225,6 +228,7 @@ struct ceph_osd_linger_request { struct mutex lock; struct rb_node node; /* osd */ struct rb_node osdc_node; /* osdc */ + struct rb_node mc_node; /* map check */ struct list_head scan_item; struct completion reg_commit_wait; @@ -257,6 +261,8 @@ struct ceph_osd_client { atomic64_t last_tid; /* tid of last request */ u64 last_linger_id; struct rb_root linger_requests; /* lingering requests */ + struct rb_root map_checks; + struct rb_root linger_map_checks; atomic_t num_requests; atomic_t num_homeless; struct delayed_work timeout_work; -- cgit v1.2.3 From 7cca78c9dcd1afa243e46edc31896730df85d2b5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:28 +0200 Subject: libceph: replace ceph_monc_request_next_osdmap() ... with a wrapper around maybe_request_map() - no need for two osdmap-specific functions. Signed-off-by: Ilya Dryomov --- include/linux/ceph/mon_client.h | 1 - include/linux/ceph/osd_client.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 19800d9b45f3..1d730993c3f8 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -128,7 +128,6 @@ bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); void ceph_monc_renew_subs(struct ceph_mon_client *monc); -extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, unsigned long timeout); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3e7bf72e4078..19b14862d3e0 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -381,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, extern void ceph_osdc_sync(struct ceph_osd_client *osdc); extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); +void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc); extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, struct ceph_vino vino, -- cgit v1.2.3 From 737cc81ead34bcef0b1f6ea8322228e4378cf21a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 May 2016 00:05:01 +0200 Subject: libceph: support for subscribing to "mdsmap." maps Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 2 ++ include/linux/ceph/mon_client.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 3b911ff889dd..bae833d0d055 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -208,6 +208,8 @@ struct ceph_mon_subscribe_ack { struct ceph_fsid fsid; } __attribute__ ((packed)); +#define CEPH_FS_CLUSTER_ID_NONE -1 + /* * mdsmap flags */ diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index 1d730993c3f8..e2a92df08b47 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -96,6 +96,7 @@ struct ceph_mon_client { bool want; u32 have; /* epoch */ } subs[3]; + int fs_cluster_id; /* "mdsmap." sub */ #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; -- cgit v1.2.3 From 956d39d631dbcf7b57854873a24e309047f2a7f5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 27 Apr 2016 17:48:30 +0800 Subject: ceph: define 'end/complete' in readdir reply as bit flags Set a flag in readdir request, which indicates that client interprets 'end/complete' as bit flags. So that mds can reply additional flags in readdir reply. Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_fs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index bae833d0d055..a811c5e98bfa 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -347,6 +347,17 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_XATTR_REPLACE (1 << 1) #define CEPH_XATTR_REMOVE (1 << 31) +/* + * readdir request flags; + */ +#define CEPH_READDIR_REPLY_BITFLAGS (1<<0) + +/* + * readdir reply flags. + */ +#define CEPH_READDIR_FRAG_END (1<<0) +#define CEPH_READDIR_FRAG_COMPLETE (1<<8) + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ @@ -364,6 +375,7 @@ union ceph_mds_request_args { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; + __le16 flags; } __attribute__ ((packed)) readdir; struct { __le32 mode; -- cgit v1.2.3 From f3c4ebe65ea149ec892f94474233cfebe9cbe299 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 29 Apr 2016 11:27:30 +0800 Subject: ceph: using hash value to compose dentry offset If MDS sorts dentries in dirfrag in hash order, we use hash value to compose dentry offset. dentry offset is: (0xff << 52) | ((24 bits hash) << 28) | (the nth entry hash hash collision) This offset is stable across directory fragmentation. This alos means there is no need to reset readdir offset if directory get fragmented in the middle of readdir. Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index a811c5e98bfa..dfce616002ad 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -357,6 +357,7 @@ extern const char *ceph_mds_op_name(int op); */ #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) union ceph_mds_request_args { struct { -- cgit v1.2.3 From 3b33f692c84c28cc8178aaeeb9264d82b48787f1 Mon Sep 17 00:00:00 2001 From: Zhang Zhuoyu Date: Fri, 25 Mar 2016 05:18:39 -0400 Subject: ceph: make logical calculation functions return bool This patch makes serverl logical caculation functions return bool to improve readability due to these particular functions only using 0/1 as their return value. No functional change. Signed-off-by: Zhang Zhuoyu --- include/linux/ceph/ceph_frag.h | 4 ++-- include/linux/ceph/decode.h | 2 +- include/linux/ceph/osdmap.h | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index b827e066e55a..146507df8650 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) return ceph_frag_make(newbits, ceph_frag_value(f) | (i << (24 - newbits))); } -static inline int ceph_frag_is_leftmost(__u32 f) +static inline bool ceph_frag_is_leftmost(__u32 f) { return ceph_frag_value(f) == 0; } -static inline int ceph_frag_is_rightmost(__u32 f) +static inline bool ceph_frag_is_rightmost(__u32 f) { return ceph_frag_value(f) == ceph_frag_mask(f); } diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index a6ef9cc267ec..19e9932f3e77 10