diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/rbd.c | 2452 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.c | 9 | ||||
-rw-r--r-- | drivers/block/zram/zram_drv.h | 16 |
3 files changed, 988 insertions, 1489 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1e03b04819c8..07dc5419bd63 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -32,6 +32,7 @@ #include <linux/ceph/osd_client.h> #include <linux/ceph/mon_client.h> #include <linux/ceph/cls_lock_client.h> +#include <linux/ceph/striper.h> #include <linux/ceph/decode.h> #include <linux/parser.h> #include <linux/bsearch.h> @@ -200,95 +201,81 @@ struct rbd_client { }; struct rbd_img_request; -typedef void (*rbd_img_callback_t)(struct rbd_img_request *); - -#define BAD_WHICH U32_MAX /* Good which or bad which, which? */ - -struct rbd_obj_request; -typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); enum obj_request_type { - OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES + OBJ_REQUEST_NODATA = 1, + OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ + OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ + OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ }; enum obj_operation_type { + OBJ_OP_READ = 1, OBJ_OP_WRITE, - OBJ_OP_READ, OBJ_OP_DISCARD, }; -enum obj_req_flags { - OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ - OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ - OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ - OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ +/* + * Writes go through the following state machine to deal with + * layering: + * + * need copyup + * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP + * | ^ | + * v \------------------------------/ + * done + * ^ + * | + * RBD_OBJ_WRITE_FLAT + * + * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether + * there is a parent or not. + */ +enum rbd_obj_write_state { + RBD_OBJ_WRITE_FLAT = 1, + RBD_OBJ_WRITE_GUARD, + RBD_OBJ_WRITE_COPYUP, }; struct rbd_obj_request { - u64 object_no; - u64 offset; /* object start byte */ - u64 length; /* bytes from offset */ - unsigned long flags; - - /* - * An object request associated with an image will have its - * img_data flag set; a standalone object request will not. - * - * A standalone object request will have which == BAD_WHICH - * and a null obj_request pointer. - * - * An object request initiated in support of a layered image - * object (to check for its existence before a write) will - * have which == BAD_WHICH and a non-null obj_request pointer. - * - * Finally, an object request for rbd image data will have - * which != BAD_WHICH, and will have a non-null img_request - * pointer. The value of which will be in the range - * 0..(img_request->obj_request_count-1). - */ + struct ceph_object_extent ex; union { - struct rbd_obj_request *obj_request; /* STAT op */ - struct { - struct rbd_img_request *img_request; - u64 img_offset; - /* links for img_request->obj_requests list */ - struct list_head links; - }; + bool tried_parent; /* for reads */ + enum rbd_obj_write_state write_state; /* for writes */ }; - u32 which; /* posn image request list */ - enum obj_request_type type; + struct rbd_img_request *img_request; + struct ceph_file_extent *img_extents; + u32 num_img_extents; + union { - struct bio *bio_list; + struct ceph_bio_iter bio_pos; struct { - struct page **pages; - u32 page_count; + struct ceph_bvec_iter bvec_pos; + u32 bvec_count; + u32 bvec_idx; }; }; - struct page **copyup_pages; - u32 copyup_page_count; + struct bio_vec *copyup_bvecs; + u32 copyup_bvec_count; struct ceph_osd_request *osd_req; u64 xferred; /* bytes transferred */ int result; - rbd_obj_callback_t callback; - struct kref kref; }; enum img_req_flags { - IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ - IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ }; struct rbd_img_request { struct rbd_device *rbd_dev; - u64 offset; /* starting image byte offset */ - u64 length; /* byte count from offset */ + enum obj_operation_type op_type; + enum obj_request_type data_type; unsigned long flags; union { u64 snap_id; /* for reads */ @@ -298,26 +285,21 @@ struct rbd_img_request { struct request *rq; /* block request */ struct rbd_obj_request *obj_request; /* obj req initiator */ }; - struct page **copyup_pages; - u32 copyup_page_count; - spinlock_t completion_lock;/* protects next_completion */ - u32 next_completion; - rbd_img_callback_t callback; + spinlock_t completion_lock; u64 xferred;/* aggregate bytes transferred */ int result; /* first nonzero obj_request result */ + struct list_head object_extents; /* obj_req.ex structs */ u32 obj_request_count; - struct list_head obj_requests; /* rbd_obj_request structs */ + u32 pending_count; struct kref kref; }; #define for_each_obj_request(ireq, oreq) \ - list_for_each_entry(oreq, &(ireq)->obj_requests, links) -#define for_each_obj_request_from(ireq, oreq) \ - list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) + list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item) #define for_each_obj_request_safe(ireq, oreq, n) \ - list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) + list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item) enum rbd_watch_state { RBD_WATCH_STATE_UNREGISTERED, @@ -433,8 +415,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; -static struct bio_set *rbd_bio_clone; - static int rbd_major; static DEFINE_IDA(rbd_dev_id_ida); @@ -447,8 +427,6 @@ static bool single_major = true; module_param(single_major, bool, S_IRUGO); MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); -static int rbd_img_request_submit(struct rbd_img_request *img_request); - static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, @@ -458,7 +436,6 @@ static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); -static void rbd_spec_put(struct rbd_spec *spec); static int rbd_dev_id_to_minor(int dev_id) { @@ -577,9 +554,6 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) # define rbd_assert(expr) ((void) 0) #endif /* !RBD_DEBUG */ -static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); -static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); -static void rbd_img_parent_read(struct rbd_obj_request *obj_request); static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); @@ -857,26 +831,6 @@ static char* obj_op_name(enum obj_operation_type op_type) } /* - * Get a ceph client with specific addr and configuration, if one does - * not exist create it. Either way, ceph_opts is consumed by this - * function. - */ -static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) -{ - struct rbd_client *rbdc; - - mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); - rbdc = rbd_client_find(ceph_opts); - if (rbdc) /* using an existing client */ - ceph_destroy_options(ceph_opts); - else - rbdc = rbd_client_create(ceph_opts); - mutex_unlock(&client_mutex); - - return rbdc; -} - -/* * Destroy ceph client * * Caller must hold rbd_client_list_lock. @@ -904,6 +858,56 @@ static void rbd_put_client(struct rbd_client *rbdc) kref_put(&rbdc->kref, rbd_client_release); } +static int wait_for_latest_osdmap(struct ceph_client *client) +{ + u64 newest_epoch; + int ret; + + ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); + if (ret) + return ret; + + if (client->osdc.osdmap->epoch >= newest_epoch) + return 0; + + ceph_osdc_maybe_request_map(&client->osdc); + return ceph_monc_wait_osdmap(&client->monc, newest_epoch, + client->options->mount_timeout); +} + +/* + * Get a ceph client with specific addr and configuration, if one does + * not exist create it. Either way, ceph_opts is consumed by this + * function. + */ +static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) +{ + struct rbd_client *rbdc; + int ret; + + mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); + rbdc = rbd_client_find(ceph_opts); + if (rbdc) { + ceph_destroy_options(ceph_opts); + + /* + * Using an existing client. Make sure ->pg_pools is up to + * date before we look up the pool id in do_rbd_add(). + */ + ret = wait_for_latest_osdmap(rbdc->client); + if (ret) { + rbd_warn(NULL, "failed to get latest osdmap: %d", ret); + rbd_put_client(rbdc); + rbdc = ERR_PTR(ret); + } + } else { + rbdc = rbd_client_create(ceph_opts); + } + mutex_unlock(&client_mutex); + + return rbdc; +} + static bool rbd_image_format_valid(u32 image_format) { return image_format == 1 || image_format == 2; @@ -1223,272 +1227,59 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.features = 0; } -static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) -{ - u64 segment_size = rbd_obj_bytes(&rbd_dev->header); - - return offset & (segment_size - 1); -} - -static u64 rbd_segment_length(struct rbd_device *rbd_dev, - u64 offset, u64 length) -{ - u64 segment_size = rbd_obj_bytes(&rbd_dev->header); - - offset &= segment_size - 1; - - rbd_assert(length <= U64_MAX - offset); - if (offset + length > segment_size) - length = segment_size - offset; - - return length; -} - -/* - * bio helpers - */ - -static void bio_chain_put(struct bio *chain) -{ - struct bio *tmp; - - while (chain) { - tmp = chain; - chain = chain->bi_next; - bio_put(tmp); - } -} - -/* - * zeros a bio chain, starting at specific offset - */ -static void zero_bio_chain(struct bio *chain, int start_ofs) +static void zero_bvec(struct bio_vec *bv) { - struct bio_vec bv; - struct bvec_iter iter; - unsigned long flags; void *buf; - int pos = 0; - - while (chain) { - bio_for_each_segment(bv, chain, iter) { - if (pos + bv.bv_len > start_ofs) { - int remainder = max(start_ofs - pos, 0); - buf = bvec_kmap_irq(&bv, &flags); - memset(buf + remainder, 0, - bv.bv_len - remainder); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(buf, &flags); - } - pos += bv.bv_len; - } + unsigned long flags; - chain = chain->bi_next; - } + buf = bvec_kmap_irq(bv, &flags); + memset(buf, 0, bv->bv_len); + flush_dcache_page(bv->bv_page); + bvec_kunmap_irq(buf, &flags); } -/* - * similar to zero_bio_chain(), zeros data defined by a page array, - * starting at the given byte offset from the start of the array and - * continuing up to the given end offset. The pages array is - * assumed to be big enough to hold all bytes up to the end. - */ -static void zero_pages(struct page **pages, u64 offset, u64 end) +static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) { - struct page **page = &pages[offset >> PAGE_SHIFT]; + struct ceph_bio_iter it = *bio_pos; - rbd_assert(end > offset); - rbd_assert(end - offset <= (u64)SIZE_MAX); - while (offset < end) { - size_t page_offset; - size_t length; - unsigned long flags; - void *kaddr; - - page_offset = offset & ~PAGE_MASK; - length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); - local_irq_save(flags); - kaddr = kmap_atomic(*page); - memset(kaddr + page_offset, 0, length); - flush_dcache_page(*page); - kunmap_atomic(kaddr); - local_irq_restore(flags); - - offset += length; - page++; - } + ceph_bio_iter_advance(&it, off); + ceph_bio_iter_advance_step(&it, bytes, ({ + zero_bvec(&bv); + })); } -/* - * Clone a portion of a bio, starting at the given byte offset - * and continuing for the number of bytes indicated. - */ -static struct bio *bio_clone_range(struct bio *bio_src, - unsigned int offset, - unsigned int len, - gfp_t gfpmask) +static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) { - struct bio *bio; - - bio = bio_clone_fast(bio_src, gfpmask, rbd_bio_clone); - if (!bio) - return NULL; /* ENOMEM */ + struct ceph_bvec_iter it = *bvec_pos; - bio_advance(bio, offset); - bio->bi_iter.bi_size = len; - - return bio; + ceph_bvec_iter_advance(&it, off); + ceph_bvec_iter_advance_step(&it, bytes, ({ + zero_bvec(&bv); + })); } /* - * Clone a portion of a bio chain, starting at the given byte offset - * into the first bio in the source chain and continuing for the - * number of bytes indicated. The result is another bio chain of - * exactly the given length, or a null pointer on error. - * - * The bio_src and offset parameters are both in-out. On entry they - * refer to the first source bio and the offset into that bio where - * the start of data to be cloned is located. + * Zero a range in @obj_req data buffer defined by a bio (list) or + * (private) bio_vec array. * - * On return, bio_src is updated to refer to the bio in the source - * chain that contains first un-cloned byte, and *offset will - * contain the offset of that byte within that bio. + * @off is relative to the start of the data buffer. */ -static struct bio *bio_chain_clone_range(struct bio **bio_src, - unsigned int *offset, - unsigned int len, - gfp_t gfpmask) +static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, + u32 bytes) { - struct bio *bi = *bio_src; - unsigned int off = *offset; - struct bio *chain = NULL; - struct bio **end; - - /* Build up a chain of clone bios up to the limit */ - - if (!bi || off >= bi->bi_iter.bi_size || !len) - return NULL; /* Nothing to clone */ - - end = &chain; - while (len) { - unsigned int bi_size; - struct bio *bio; - - if (!bi) { - rbd_warn(NULL, "bio_chain exhausted with %u left", len); - goto out_err; /* EINVAL; ran out of bio's */ - } - bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); - bio = bio_clone_range(bi, off, bi_size, gfpmask); - if (!bio) - goto out_err; /* ENOMEM */ - - *end = bio; - end = &bio->bi_next; - - off += bi_size; - if (off == bi->bi_iter.bi_size) { - bi = bi->bi_next; - off = 0; - } - len -= bi_size; - } - *bio_src = bi; - *offset = off; - - return chain; -out_err: - bio_chain_put(chain); - - return NULL; -} - -/* - * The default/initial value for all object request flags is 0. For - * each flag, once its value is set to 1 it is never reset to 0 - * again. - */ -static void obj_request_img_data_set(struct rbd_obj_request *obj_request) -{ - if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { - struct rbd_device *rbd_dev; - - rbd_dev = obj_request->img_request->rbd_dev; - rbd_warn(rbd_dev, "obj_request %p already marked img_data", - obj_request); - } -} - -static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; -} - -static void obj_request_done_set(struct rbd_obj_request *obj_request) -{ - if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { - struct rbd_device *rbd_dev = NULL; - - if (obj_request_img_data_test(obj_request)) - rbd_dev = obj_request->img_request->rbd_dev; - rbd_warn(rbd_dev, "obj_request %p already marked done", - obj_request); + switch (obj_req->img_request->data_type) { + case OBJ_REQUEST_BIO: + zero_bios(&obj_req->bio_pos, off, bytes); + break; + case OBJ_REQUEST_BVECS: + case OBJ_REQUEST_OWN_BVECS: + zero_bvecs(&obj_req->bvec_pos, off, bytes); + break; + default: + rbd_assert(0); } } -static bool obj_request_done_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; -} - -/* - * This sets the KNOWN flag after (possibly) setting the EXISTS - * flag. The latter is set based on the "exists" value provided. - * - * Note that for our purposes once an object exists it never goes - * away again. It's possible that the response from two existence - * checks are separated by the creation of the target object, and - * the first ("doesn't exist") response arrives *after* the second - * ("does exist"). In that case we ignore the second one. - */ -static void obj_request_existence_set(struct rbd_obj_request *obj_request, - bool exists) -{ - if (exists) - set_bit(OBJ_REQ_EXISTS, &obj_request->flags); - set_bit(OBJ_REQ_KNOWN, &obj_request->flags); - smp_mb(); -} - -static bool obj_request_known_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; -} - -static bool obj_request_exists_test(struct rbd_obj_request *obj_request) -{ - smp_mb(); - return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; -} - -static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) -{ - struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; - - return obj_request->img_offset < - round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); -} - -static void rbd_obj_request_get(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p (was %d)\n", __func__, obj_request, - kref_read(&obj_request->kref)); - kref_get(&obj_request->kref); -} - static void rbd_obj_request_destroy(struct kref *kref); static void rbd_obj_request_put(struct rbd_obj_request *obj_request) { @@ -1505,18 +1296,13 @@ static void rbd_img_request_get(struct rbd_img_request *img_request) kref_get(&img_request->kref); } -static bool img_request_child_test(struct rbd_img_request *img_request); -static void rbd_parent_request_destroy(struct kref *kref); static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { rbd_assert(img_request != NULL); dout("%s: img %p (was %d)\n", __func__, img_request, kref_read(&img_request->kref)); - if (img_request_child_test(img_request)) - kref_put(&img_request->kref, rbd_parent_request_destroy); - else - kref_put(&img_request->kref, rbd_img_request_destroy); + kref_put(&img_request->kref, rbd_img_request_destroy); } static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, @@ -1526,139 +1312,37 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, /* Image request now owns object's original reference */ obj_request->img_request = img_request; - obj_request->which = img_request->obj_request_count; - rbd_assert(!obj_request_img_data_test(obj_request)); - obj_request_img_data_set(obj_request); - rbd_assert(obj_request->which != BAD_WHICH); img_request->obj_request_count++; - list_add_tail(&obj_request->links, &img_request->obj_requests); - dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, - obj_request->which); + img_request->pending_count++; + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); } static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, struct rbd_obj_request *obj_request) { - rbd_assert(obj_request->which != BAD_WHICH); - - dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, - obj_request->which); - list_del(&obj_request->links); + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); + list_del(&obj_request->ex.oe_item); rbd_assert(img_request->obj_request_count > 0); img_request->obj_request_count--; - rbd_assert(obj_request->which == img_request->obj_request_count); - obj_request->which = BAD_WHICH; - rbd_assert(obj_request_img_data_test(obj_request)); rbd_assert(obj_request->img_request == img_request); - obj_request->img_request = NULL; - obj_request->callback = NULL; rbd_obj_request_put(obj_request); } -static bool obj_request_type_valid(enum obj_request_type type) -{ - switch (type) { - case OBJ_REQUEST_NODATA: - case OBJ_REQUEST_BIO: - case OBJ_REQUEST_PAGES: - return true; - default: - return false; - } -} - -static void rbd_img_obj_callback(struct rbd_obj_request *obj_request); - static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, - obj_request, obj_request->object_no, obj_request->offset, - obj_request->length, osd_req); - if (obj_request_img_data_test(obj_request)) { - WARN_ON(obj_request->callback != rbd_img_obj_callback); - rbd_img_request_get(obj_request->img_request); - } + obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off, + obj_request->ex.oe_len, osd_req); ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); } -static void rbd_img_request_complete(struct rbd_img_request *img_request) -{ - - dout("%s: img %p\n", __func__, img_request); - - /* - * If no error occurred, compute the aggregate transfer - * count for the image request. We could instead use - * atomic64_cmpxchg() to update it as each object request - * completes; not clear which way is better off hand. - */ - if (!img_request->result) { - struct rbd_obj_request *obj_request; - u64 xferred = 0; - - for_each_obj_request(img_request, obj_request) - xferred += obj_request->xferred; - img_request->xferred = xferred; - } - - if (img_request->callback) - img_request->callback(img_request); - else - rbd_img_request_put(img_request); -} - /* * The default/initial value for all image request flags is 0. Each * is conditionally set to 1 at image request initialization time * and currently never change thereafter. */ -static void img_request_write_set(struct rbd_img_request *img_request) -{ - set_bit(IMG_REQ_WRITE, &img_request->flags); - smp_mb(); -} - -static bool img_request_write_test(struct rbd_img_request *img_request) -{ - smp_mb(); - return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; -} - -/* - * Set the discard flag when the img_request is an discard request - */ -static void img_request_discard_set(struct rbd_img_request *img_request) -{ - set_bit(IMG_REQ_DISCARD, &img_request->flags); - smp_mb(); -} - -static bool img_request_discard_test(struct rbd_img_request *img_request) -{ - smp_mb(); - return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; -} - -static void img_request_child_set(struct rbd_img_request *img_request) -{ - set_bit(IMG_REQ_CHILD, &img_request->flags); - smp_mb(); -} - -static void img_request_child_clear(struct rbd_img_request *img_request) -{ - clear_bit(IMG_REQ_CHILD, &img_request->flags); - smp_mb(); -} - -static bool img_request_child_test(struct rbd_img_request *img_request) -{ - smp_mb(); - return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; -} - static void img_request_layered_set(struct rbd_img_request *img_request) { set_bit(IMG_REQ_LAYERED, &img_request->flags); @@ -1677,209 +1361,70 @@ static bool img_request_layered_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; } -static enum obj_operation_type -rbd_img_request_op_type(struct rbd_img_request *img_request) -{ - if (img_request_write_test(img_request)) - return OBJ_OP_WRITE; - else if (img_request_discard_test(img_request)) - return OBJ_OP_DISCARD; - else - return OBJ_OP_READ; -} - -static void -rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) -{ - u64 xferred = obj_request->xferred; - u64 length = obj_request->length; - - dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, - obj_request, obj_request->img_request, obj_request->result, - xferred, length); - /* - * ENOENT means a hole in the image. We zero-fill the entire - * length of the request. A short read also implies zero-fill - * to the end of the request. An error requires the whole - * length of the request to be reported finished with an error - * to the block layer. In each case we update the xferred - * count to indicate the whole request was satisfied. - */ - rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); - if (obj_request->result == -ENOENT) { - if (obj_request->type == OBJ_REQUEST_BIO) - zero_bio_chain(obj_request->bio_list, 0); - else - zero_pages(obj_request->pages, 0, length); - obj_request->result = 0; - } else if (xferred < length && !obj_request->result) { - if (obj_request->type == OBJ_REQUEST_BIO) - zero_bio_chain(obj_request->bio_list, xferred); - else - zero_pages(obj_request->pages, xferred, length); - } - obj_request->xferred = length; - obj_request_done_set(obj_request); -} - -static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) +static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) { - dout("%s: obj %p cb %p\n", __func__, obj_request, - obj_request->callback); - obj_request->callback(obj_request); -} + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; -static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err) -{ - obj_request->result = err; - obj_request->xferred = 0; - /* - * kludge - mirror rbd_obj_request_submit() to match a put in - * rbd_img_obj_callback() - */ - if (obj_request_img_data_test(obj_request)) { - WARN_ON(obj_request->callback != rbd_img_obj_callback); - rbd_img_request_get(obj_request->img_request); - } - obj_request_done_set(obj_request); - rbd_obj_request_complete(obj_request); + return !obj_req->ex.oe_off && + obj_req->ex.oe_len == rbd_dev->layout.object_size; } -static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) +static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) { - struct rbd_img_request *img_request = NULL; - struct rbd_device *rbd_dev = NULL; - bool layered = false; - - if (obj_request_img_data_test(obj_request)) { - img_request = obj_request->img_request; - layered = img_request && img_request_layered_test(img_request); - rbd_dev = img_request->rbd_dev; - } - - dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, - obj_request, img_request, obj_request->result, - obj_request->xferred, obj_request->length); - if (layered && obj_request->result == -ENOENT && - obj_request->img_offset < rbd_dev->parent_overlap) - rbd_img_parent_read(obj_request); - else if (img_request) - rbd_img_obj_request_read_callback(obj_request); - else - obj_request_done_set(obj_request); -} + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; -static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p result %d %llu\n", __func__, obj_request, - obj_request->result, obj_request->length); - /* - * There is no such thing as a successful short write. Set - * it to our originally-requested length. - */ - obj_request->xferred = obj_request->length; - obj_request_done_set(obj_request); + return obj_req->ex.oe_off + obj_req->ex.oe_len == + rbd_dev->layout.object_size; } -static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) +static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) { - dout("%s: obj %p result %d %llu\n", __func__, obj_request, - obj_request->result, obj_request->length); - /* - * There is no such thing as a successful short discard. Set - * it to our originally-requested length. - */ - obj_request->xferred = obj_request->length; - /* discarding a non-existent object is not a problem */ - if (obj_request->result == -ENOENT) - obj_request->result = 0; - obj_request_done_set(obj_request); + return ceph_file_extents_bytes(obj_req->img_extents, + obj_req->num_img_extents); } -/* - * For a simple stat call there's nothing to do. We'll do more if - * this is part of a write sequence for a layered image. - */ -static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) +static bool rbd_img_is_write(struct rbd_img_request *img_req) { - dout("%s: obj %p\n", __func__, obj_request); - obj_request_done_set(obj_request); + switch (img_req->op_type) { + case OBJ_OP_READ: + return false; + case OBJ_OP_WRITE: + case OBJ_OP_DISCARD: + return true; + default: + rbd_assert(0); + } } -static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) -{ - dout("%s: obj %p\n", __func__, obj_request); - - if (obj_request_img_data_test(obj_request)) - rbd_osd_copyup_callback(obj_request); - else - obj_request_done_set(obj_request); -} +static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) { - struct rbd_obj_request *obj_request = osd_req->r_priv; - u16 opcode; + struct rbd_obj_request *obj_req = osd_req->r_priv; - dout("%s: osd_req %p\n", __func__, osd_req); - rbd_assert(osd_req == obj_request->osd_req); - if (obj_request_img_data_test(obj_request)) { - rbd_assert(obj_request->img_request); - rbd_assert(obj_request->which != BAD_WHICH); - } else { - rbd_assert(obj_request->which == BAD_WHICH); - } - - if (osd_req->r_result < 0) - obj_request->result = osd_req->r_result; - - /* - * We support a 64-bit length, but ultimately it has to be - * passed to the block layer, which just supports a 32-bit - * length field. - */ - obj_request->xferred = osd_req->r_ops[0].outdata_len; - rbd_assert(obj_request->xferred < (u64)UINT_MAX); + dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, + osd_req->r_result, obj_req); + rbd_assert(osd_req == obj_req->osd_req); - opcode = osd_req->r_ops[0].op; - switch (opcode) { - case CEPH_OSD_OP_READ: - rbd_osd_read_callback(obj_request); - break; - case CEPH_OSD_OP_SETALLOCHINT: - rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || - osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); - /* fall through */ - case CEPH_OSD_OP_WRITE: - case CEPH_OSD_OP_WRITEFULL: - rbd_osd_write_callback(obj_request); - break; - case CEPH_OSD_OP_STAT: - rbd_osd_stat_callback(obj_request); - break; - case CEPH_OSD_OP_DELETE: - case CEPH_OSD_OP_TRUNCATE: - case CEPH_OSD_OP_ZERO: - rbd_osd_discard_callback(obj_request); - break; - case CEPH_OSD_OP_CALL: - rbd_osd_call_callback(obj_request); - break; - default: - rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d", - obj_request->object_no, opcode); - break; - } + obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0; + if (!obj_req->result && !rbd_img_is_write(obj_req->img_request)) + obj_req->xferred = osd_req->r_result; + else + /* + * Writes aren't allowed to return a data payload. In some + * guarded write cases (e.g. stat + zero on an empty object) + * a stat response makes it through, but we don't care. + */ + obj_req->xferred = 0; - if (obj_request_done_test(obj_request)) - rbd_obj_request_complete(obj_request); + rbd_obj_handle_request(obj_req); } static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; - rbd_assert(obj_request_img_data_test(obj_request)); + osd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req->r_snapid = obj_request->img_request->snap_id; } @@ -1887,32 +1432,33 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) { struct ceph_osd_request *osd_req = obj_request->osd_req; + osd_req->r_flags = CEPH_OSD_FLAG_WRITE; ktime_get_real_ts(&osd_req->r_mtime); - osd_req->r_data_offset = obj_request->offset; + osd_req->r_data_offset = obj_request->ex.oe_off; } static struct ceph_osd_request * -__rbd_osd_req_create(struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - int num_ops, unsigned int flags, - struct rbd_obj_request *obj_request) +rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) { + struct rbd_img_request *img_req = obj_req->img_request; + struct rbd_device *rbd_dev = img_req->rbd_dev; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_request *req; const char *name_format = rbd_dev->image_format == 1 ? RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; - req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); + req = ceph_osdc_alloc_request(osdc, + (rbd_img_is_write(img_req) ? img_req->snapc : NULL), + num_ops, false, GFP_NOIO); if (!req) return NULL; - req->r_flags = flags; req->r_callback = rbd_osd_req_callback; - req->r_priv = obj_request; + req->r_priv = obj_req; req->r_base_oloc.pool = rbd_dev->layout.pool_id; if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, - rbd_dev->header.object_prefix, obj_request->object_no)) + rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) goto err_req; if (ceph_osdc_alloc_messages(req, GFP_NOIO)) @@ -1925,83 +1471,20 @@ err_req: return NULL; } -/* - * Create an osd request. A read request has one osd op (read). - * A write request has either one (watch) or two (hint+write) osd ops. - * (All rbd data writes are prefixed with an allocation hint op, but - * technically osd watch is a write request, hence this distinction.) - */ -static struct ceph_osd_request *rbd_osd_req_create( - struct rbd_device *rbd_dev, - enum obj_operation_type op_type, - unsigned int num_ops, - struct rbd_obj_request *obj_request) -{ - struct ceph_snap_context *snapc = NULL; - - if (obj_request_img_data_test(obj_request) && - (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { - struct rbd_img_request *img_request = obj_request->img_request; - if (op_type == OBJ_OP_WRITE) { - rbd_assert(img_request_write_test(img_request)); - } else { - rbd_assert(img_request_discard_test(img_request)); - } - snapc = img_request->snapc; - } |