From 94e6992bb560be8bffb47f287194adf070b57695 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 26 Sep 2018 18:03:16 +0200 Subject: libceph: bump CEPH_MSG_MAX_DATA_LEN If the read is large enough, we end up spinning in the messenger: libceph: osd0 192.168.122.1:6801 io error libceph: osd0 192.168.122.1:6801 io error libceph: osd0 192.168.122.1:6801 io error This is a receive side limit, so only reads were affected. Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 49c93b9308d7..68bb09c29ce8 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -81,7 +81,13 @@ struct ceph_options { #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) -#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) + +/* + * Handle the largest possible rbd object in one message. + * There is no limit on the size of cephfs objects, but it has to obey + * rsize and wsize mount options anyway. + */ +#define CEPH_MSG_MAX_DATA_LEN (32*1024*1024) #define CEPH_AUTH_NAME_DEFAULT "guest" -- cgit v1.2.3 From 24639ce56040a8ea890ad8836068c1ad8bd177c7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 26 Sep 2018 19:12:07 +0200 Subject: libceph: osd_req_op_cls_init() doesn't need to take opcode Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 02096da01845..f3e828460c91 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -444,9 +444,8 @@ extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); -extern int osd_req_op_cls_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - const char *class, const char *method); +int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, + const char *class, const char *method); extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *name, const void *value, size_t size, u8 cmp_op, u8 cmp_mode); -- cgit v1.2.3 From 33165d472310262d8c79c7e4d1a17dc60cea7e35 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 28 Sep 2018 15:38:34 +0200 Subject: libceph: introduce ceph_pagelist_alloc() struct ceph_pagelist cannot be embedded into anything else because it has its own refcount. Merge allocation and initialization together. Signed-off-by: Ilya Dryomov --- include/linux/ceph/pagelist.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index d0223364349f..5dead8486fd8 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -23,16 +23,7 @@ struct ceph_pagelist_cursor { size_t room; /* room remaining to reset to */ }; -static inline void ceph_pagelist_init(struct ceph_pagelist *pl) -{ - INIT_LIST_HEAD(&pl->head); - pl->mapped_tail = NULL; - pl->length = 0; - pl->room = 0; - INIT_LIST_HEAD(&pl->free_list); - pl->num_pages_free = 0; - refcount_set(&pl->refcnt, 1); -} +struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags); extern void ceph_pagelist_release(struct ceph_pagelist *pl); -- cgit v1.2.3 From 0d9c1ab3be4c0187663096a6a084421d0a1e45c6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 15 Oct 2018 17:38:23 +0200 Subject: libceph: preallocate message data items Currently message data items are allocated with ceph_msg_data_create() in setup_request_data() inside send_request(). send_request() has never been allowed to fail, so each allocation is followed by a BUG_ON: data = ceph_msg_data_create(...); BUG_ON(!data); It's been this way since support for multiple message data items was added in commit 6644ed7b7e04 ("libceph: make message data be a pointer") in 3.10. There is no reason to delay the allocation of message data items until the last possible moment and we certainly don't need a linked list of them as they are only ever appended to the end and never erased. Make ceph_msg_new2() take max_data_items and adapt the rest of the code. Reported-by: Jerry Lee Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 24 +++++------------------- include/linux/ceph/msgpool.h | 11 ++++++----- 2 files changed, 11 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index fc2b4491ee0a..800a2128d411 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -82,22 +82,6 @@ enum ceph_msg_data_type { CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */ }; -static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) -{ - switch (type) { - case CEPH_MSG_DATA_NONE: - case CEPH_MSG_DATA_PAGES: - case CEPH_MSG_DATA_PAGELIST: -#ifdef CONFIG_BLOCK - case CEPH_MSG_DATA_BIO: -#endif /* CONFIG_BLOCK */ - case CEPH_MSG_DATA_BVECS: - return true; - default: - return false; - } -} - #ifdef CONFIG_BLOCK struct ceph_bio_iter { @@ -181,7 +165,6 @@ struct ceph_bvec_iter { } while (0) struct ceph_msg_data { - struct list_head links; /* ceph_msg->data */ enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK @@ -202,7 +185,6 @@ struct ceph_msg_data { struct ceph_msg_data_cursor { size_t total_resid; /* across all data items */ - struct list_head *data_head; /* = &ceph_msg->data */ struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ @@ -240,7 +222,9 @@ struct ceph_msg { struct ceph_buffer *middle; size_t data_length; - struct list_head data; + struct ceph_msg_data *data; + int num_data_items; + int max_data_items; struct ceph_msg_data_cursor cursor; struct ceph_connection *con; @@ -381,6 +365,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, void ceph_msg_data_add_bvecs(struct ceph_msg *msg, struct ceph_bvec_iter *bvec_pos); +struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, + gfp_t flags, bool can_fail); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h index 76c98a512758..729cdf700eae 100644 --- a/include/linux/ceph/msgpool.h +++ b/include/linux/ceph/msgpool.h @@ -13,14 +13,15 @@ struct ceph_msgpool { mempool_t *pool; int type; /* preallocated message type */ int front_len; /* preallocated payload size */ + int max_data_items; }; -extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type, - int front_len, int size, bool blocking, - const char *name); +int ceph_msgpool_init(struct ceph_msgpool *pool, int type, + int front_len, int max_data_items, int size, + const char *name); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); -extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, - int front_len); +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len, + int max_data_items); extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); #endif -- cgit v1.2.3 From 23ddf9bea900b4ce39e5707e1ddf66062cfe6d91 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 15 Oct 2018 16:45:58 +0100 Subject: libceph: support the RADOS copy-from operation Add support for performing remote object copies using the 'copy-from' operation. [ Add COPY_FROM to get_num_data_items(). ] Signed-off-by: Luis Henriques Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 17 +++++++++++++++++ include/linux/ceph/rados.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f3e828460c91..7a2af5034278 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -136,6 +136,13 @@ struct ceph_osd_req_op { u64 expected_object_size; u64 expected_write_size; } alloc_hint; + struct { + u64 snapid; + u64 src_version; + u8 flags; + u32 src_fadvise_flags; + struct ceph_osd_data osd_data; + } copy_from; }; }; @@ -510,6 +517,16 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct timespec64 *mtime, struct page **pages, int nr_pages); +int ceph_osdc_copy_from(struct ceph_osd_client *osdc, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + struct ceph_object_id *dst_oid, + struct ceph_object_locator *dst_oloc, + u32 dst_fadvise_flags, + u8 copy_from_flags); + /* watch/notify */ struct ceph_osd_linger_request * ceph_osdc_watch(struct ceph_osd_client *osdc, diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index f1988387c5ad..3eb0e55665b4 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -410,6 +410,14 @@ enum { enum { CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */ + CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */ + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */ + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in + the near future */ + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed + in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only + once by this client */ }; #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ @@ -431,6 +439,15 @@ enum { CEPH_OSD_CMPXATTR_MODE_U64 = 2 }; +enum { + CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */ + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to + * cloneid */ + CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ +}; + enum { CEPH_OSD_WATCH_OP_UNWATCH = 0, CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, @@ -497,6 +514,17 @@ struct ceph_osd_op { __le64 expected_object_size; __le64 expected_write_size; } __attribute__ ((packed)) alloc_hint; + struct { + __le64 snapid; + __le64 src_version; + __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */ + /* + * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags + * for src object, flags for dest object are in + * ceph_osd_op::flags. + */ + __le32 src_fadvise_flags; + } __attribute__ ((packed)) copy_from; }; __le32 payload_len; } __attribute__ ((packed)); -- cgit v1.2.3