summaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
authorAlex Elder <elder@inktank.com>2013-02-19 19:20:56 -0600
committerAlex Elder <elder@inktank.com>2013-02-19 19:21:08 -0600
commit4c7a08c83a7842e88838dde16684d6bafffdfaf0 (patch)
treec5fe0057b2ff9f98a64ceb6fa076e75da8225cdd /drivers/block
parent19f949f52599ba7c3f67a5897ac6be14bfcb1200 (diff)
parent903bb32e890237ca43ab847e561e5377cfe0fdb3 (diff)
Merge branch 'testing' of github.com:ceph/ceph-client into into linux-3.8-ceph
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/rbd.c1773
1 files changed, 1087 insertions, 686 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 89576a0b3f2e..b0eea3eaee93 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -52,9 +52,12 @@
#define SECTOR_SHIFT 9
#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
-/* It might be useful to have this defined elsewhere too */
+/* It might be useful to have these defined elsewhere */
-#define U64_MAX ((u64) (~0ULL))
+#define U8_MAX ((u8) (~0U))
+#define U16_MAX ((u16) (~0U))
+#define U32_MAX ((u32) (~0U))
+#define U64_MAX ((u64) (~0ULL))
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"
@@ -66,7 +69,6 @@
(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
-#define RBD_MAX_OPT_LEN 1024
#define RBD_SNAP_HEAD_NAME "-"
@@ -93,8 +95,6 @@
#define DEV_NAME_LEN 32
#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
-#define RBD_READ_ONLY_DEFAULT false
-
/*
* block device image metadata (in-memory version)
*/
@@ -119,16 +119,33 @@ struct rbd_image_header {
* An rbd image specification.
*
* The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
- * identify an image.
+ * identify an image. Each rbd_dev structure includes a pointer to
+ * an rbd_spec structure that encapsulates this identity.
+ *
+ * Each of the id's in an rbd_spec has an associated name. For a
+ * user-mapped image, the names are supplied and the id's associated
+ * with them are looked up. For a layered image, a parent image is
+ * defined by the tuple, and the names are looked up.
+ *
+ * An rbd_dev structure contains a parent_spec pointer which is
+ * non-null if the image it represents is a child in a layered
+ * image. This pointer will refer to the rbd_spec structure used
+ * by the parent rbd_dev for its own identity (i.e., the structure
+ * is shared between the parent and child).
+ *
+ * Since these structures are populated once, during the discovery
+ * phase of image construction, they are effectively immutable so
+ * we make no effort to synchronize access to them.
+ *
+ * Note that code herein does not assume the image name is known (it
+ * could be a null pointer).
*/
struct rbd_spec {
u64 pool_id;
char *pool_name;
char *image_id;
- size_t image_id_len;
char *image_name;
- size_t image_name_len;
u64 snap_id;
char *snap_name;
@@ -136,10 +153,6 @@ struct rbd_spec {
struct kref kref;
};
-struct rbd_options {
- bool read_only;
-};
-
/*
* an instance of the client. multiple devices may share an rbd client.
*/
@@ -149,37 +162,76 @@ struct rbd_client {
struct list_head node;
};
-/*
- * a request completion status
- */
-struct rbd_req_status {
- int done;
- int rc;
- u64 bytes;
+struct rbd_img_request;
+typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
+
+#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
+
+struct rbd_obj_request;
+typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
+
+enum obj_request_type {
+ OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
-/*
- * a collection of requests
- */
-struct rbd_req_coll {
- int total;
- int num_done;
+struct rbd_obj_request {
+ const char *object_name;
+ u64 offset; /* object start byte */
+ u64 length; /* bytes from offset */
+
+ struct rbd_img_request *img_request;
+ struct list_head links; /* img_request->obj_requests */
+ u32 which; /* posn image request list */
+
+ enum obj_request_type type;
+ union {
+ struct bio *bio_list;
+ struct {
+ struct page **pages;
+ u32 page_count;
+ };
+ };
+
+ struct ceph_osd_request *osd_req;
+
+ u64 xferred; /* bytes transferred */
+ u64 version;
+ s32 result;
+ atomic_t done;
+
+ rbd_obj_callback_t callback;
+ struct completion completion;
+
struct kref kref;
- struct rbd_req_status status[0];
};
-/*
- * a single io request
- */
-struct rbd_request {
- struct request *rq; /* blk layer request */
- struct bio *bio; /* cloned bio */
- struct page **pages; /* list of used pages */
- u64 len;
- int coll_index;
- struct rbd_req_coll *coll;
+struct rbd_img_request {
+ struct request *rq;
+ struct rbd_device *rbd_dev;
+ u64 offset; /* starting image byte offset */
+ u64 length; /* byte count from offset */
+ bool write_request; /* false for read */
+ union {
+ struct ceph_snap_context *snapc; /* for writes */
+ u64 snap_id; /* for reads */
+ };
+ spinlock_t completion_lock;/* protects next_completion */
+ u32 next_completion;
+ rbd_img_callback_t callback;
+
+ u32 obj_request_count;
+ struct list_head obj_requests; /* rbd_obj_request structs */
+
+ struct kref kref;
};
+#define for_each_obj_request(ireq, oreq) \
+ list_for_each_entry(oreq, &(ireq)->obj_requests, links)
+#define for_each_obj_request_from(ireq, oreq) \
+ list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
+#define for_each_obj_request_safe(ireq, oreq, n) \
+ list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
+
struct rbd_snap {
struct device dev;
const char *name;
@@ -209,16 +261,18 @@ struct rbd_device {
char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
- spinlock_t lock; /* queue lock */
+ spinlock_t lock; /* queue, flags, open_count */
struct rbd_image_header header;
- bool exists;
+ unsigned long flags; /* possibly lock protected */
struct rbd_spec *spec;
char *header_name;
+ struct ceph_file_layout layout;
+
struct ceph_osd_event *watch_event;
- struct ceph_osd_request *watch_request;
+ struct rbd_obj_request *watch_request;
struct rbd_spec *parent_spec;
u64 parent_overlap;
@@ -235,7 +289,19 @@ struct rbd_device {
/* sysfs related */
struct device dev;
- unsigned long open_count;
+ unsigned long open_count; /* protected by lock */
+};
+
+/*
+ * Flag bits for rbd_dev->flags. If atomicity is required,
+ * rbd_dev->lock is used to protect access.
+ *
+ * Currently, only the "removing" flag (which is coupled with the
+ * "open_count" field) requires atomic access.
+ */
+enum rbd_dev_flags {
+ RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
+ RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
};
static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
@@ -277,6 +343,33 @@ static struct device rbd_root_dev = {
.release = rbd_root_dev_release,
};
+static __printf(2, 3)
+void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (!rbd_dev)
+ printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
+ else if (rbd_dev->disk)
+ printk(KERN_WARNING "%s: %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
+ else if (rbd_dev->spec && rbd_dev->spec->image_name)
+ printk(KERN_WARNING "%s: image %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
+ else if (rbd_dev->spec && rbd_dev->spec->image_id)
+ printk(KERN_WARNING "%s: id %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
+ else /* punt */
+ printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
+ RBD_DRV_NAME, rbd_dev, &vaf);
+ va_end(args);
+}
+
#ifdef RBD_DEBUG
#define rbd_assert(expr) \
if (unlikely(!(expr))) { \
@@ -296,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+ bool removing = false;
if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
return -EROFS;
+ spin_lock_irq(&rbd_dev->lock);
+ if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
+ removing = true;
+ else
+ rbd_dev->open_count++;
+ spin_unlock_irq(&rbd_dev->lock);
+ if (removing)
+ return -ENOENT;
+
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
(void) get_device(&rbd_dev->dev);
set_device_ro(bdev, rbd_dev->mapping.read_only);
- rbd_dev->open_count++;
mutex_unlock(&ctl_mutex);
return 0;
@@ -312,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
struct rbd_device *rbd_dev = disk->private_data;
+ unsigned long open_count_before;
+
+ spin_lock_irq(&rbd_dev->lock);
+ open_count_before = rbd_dev->open_count--;
+ spin_unlock_irq(&rbd_dev->lock);
+ rbd_assert(open_count_before > 0);
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- rbd_assert(rbd_dev->open_count > 0);
- rbd_dev->open_count--;
put_device(&rbd_dev->dev);
mutex_unlock(&ctl_mutex);
@@ -426,6 +532,12 @@ static match_table_t rbd_opts_tokens = {
{-1, NULL}
};
+struct rbd_options {
+ bool read_only;
+};
+
+#define RBD_READ_ONLY_DEFAULT false
+
static int parse_rbd_opts_token(char *c, void *private)
{
struct rbd_options *rbd_opts = private;
@@ -512,18 +624,6 @@ static void rbd_put_client(struct rbd_client *rbdc)
kref_put(&rbdc->kref, rbd_client_release);
}
-/*
- * Destroy requests collection
- */
-static void rbd_coll_release(struct kref *kref)
-{
- struct rbd_req_coll *coll =
- container_of(kref, struct rbd_req_coll, kref);
-
- dout("rbd_coll_release %p\n", coll);
- kfree(coll);
-}
-
static bool rbd_image_format_valid(u32 image_format)
{
return image_format == 1 || image_format == 2;
@@ -707,7 +807,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
goto done;
rbd_dev->mapping.read_only = true;
}
- rbd_dev->exists = true;
+ set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+
done:
return ret;
}
@@ -724,7 +825,7 @@ static void rbd_header_free(struct rbd_image_header *header)
header->snapc = NULL;
}
-static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
+static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
char *name;
u64 segment;
@@ -767,23 +868,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev,
return length;
}
-static int rbd_get_num_segments(struct rbd_image_header *header,
- u64 ofs, u64 len)
-{
- u64 start_seg;
- u64 end_seg;
-
- if (!len)
- return 0;
- if (len - 1 > U64_MAX - ofs)
- return -ERANGE;
-
- start_seg = ofs >> header->obj_order;
- end_seg = (ofs + len - 1) >> header->obj_order;
-
- return end_seg - start_seg + 1;
-}
-
/*
* returns the size of an object in the image
*/
@@ -949,8 +1033,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src,
unsigned int bi_size;
struct bio *bio;
- if (!bi)
+ if (!bi) {
+ rbd_warn(NULL, "bio_chain exhausted with %u left", len);
goto out_err; /* EINVAL; ran out of bio's */
+ }
bi_size = min_t(unsigned int, bi->bi_size - off, len);
bio = bio_clone_range(bi, off, bi_size, gfpmask);
if (!bio)
@@ -976,399 +1062,665 @@ out_err:
return NULL;
}
-/*
- * helpers for osd request op vectors.
- */
-static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
- int opcode, u32 payload_len)
+static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
- struct ceph_osd_req_op *ops;
+ kref_get(&obj_request->kref);
+}
- ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
- if (!ops)
+static void rbd_obj_request_destroy(struct kref *kref);
+static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request != NULL);
+ kref_put(&obj_request->kref, rbd_obj_request_destroy);
+}
+
+static void rbd_img_request_get(struct rbd_img_request *img_request)
+{
+ kref_get(&img_request->kref);
+}
+
+static void rbd_img_request_destroy(struct kref *kref);
+static void rbd_img_request_put(struct rbd_img_request *img_request)
+{
+ rbd_assert(img_request != NULL);
+ kref_put(&img_request->kref, rbd_img_request_destroy);
+}
+
+static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
+ struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request->img_request == NULL);
+
+ rbd_obj_request_get(obj_request);
+ obj_request->img_request = img_request;
+ obj_request->which = img_request->obj_request_count;
+ rbd_assert(obj_request->which != BAD_WHICH);
+ img_request->obj_request_count++;
+ list_add_tail(&obj_request->links, &img_request->obj_requests);
+}
+
+static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
+ struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request->which != BAD_WHICH);
+
+ list_del(&obj_request->links);
+ rbd_assert(img_request->obj_request_count > 0);
+ img_request->obj_request_count--;
+ rbd_assert(obj_request->which == img_request->obj_request_count);
+ obj_request->which = BAD_WHICH;
+ rbd_assert(obj_request->img_request == img_request);
+ obj_request->img_request = NULL;
+ obj_request->callback = NULL;
+ rbd_obj_request_put(obj_request);
+}
+
+static bool obj_request_type_valid(enum obj_request_type type)
+{
+ switch (type) {
+ case OBJ_REQUEST_NODATA:
+ case OBJ_REQUEST_BIO:
+ case OBJ_REQUEST_PAGES:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
+{
+ struct ceph_osd_req_op *op;
+ va_list args;
+ size_t size;
+
+ op = kzalloc(sizeof (*op), GFP_NOIO);
+ if (!op)
return NULL;
+ op->op = opcode;
+ va_start(args, opcode);
+ switch (opcode) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_WRITE:
+ /* rbd_osd_req_op_create(READ, offset, length) */
+ /* rbd_osd_req_op_create(WRITE, offset, length) */
+ op->extent.offset = va_arg(args, u64);
+ op->extent.length = va_arg(args, u64);
+ if (opcode == CEPH_OSD_OP_WRITE)
+ op->payload_len = op->extent.length;
+ break;
+ case CEPH_OSD_OP_STAT:
+ break;
+ case CEPH_OSD_OP_CALL:
+ /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
+ op->cls.class_name = va_arg(args, char *);
+ size = strlen(op->cls.class_name);
+ rbd_assert(size <= (size_t) U8_MAX);
+ op->cls.class_len = size;
+ op->payload_len = size;
+
+ op->cls.method_name = va_arg(args, char *);
+ size = strlen(op->cls.method_name);
+ rbd_assert(size <= (size_t) U8_MAX);
+ op->cls.method_len = size;
+ op->payload_len += size;
+
+ op->cls.argc = 0;
+ op->cls.indata = va_arg(args, void *);
+ size = va_arg(args, size_t);
+ rbd_assert(size <= (size_t) U32_MAX);
+ op->cls.indata_len = (u32) size;
+ op->payload_len += size;
+ break;
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_WATCH:
+ /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
+ /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
+ op->watch.cookie = va_arg(args, u64);
+ op->watch.ver = va_arg(args, u64);
+ op->watch.ver = cpu_to_le64(op->watch.ver);
+ if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
+ op->watch.flag = (u8) 1;
+ break;
+ default:
+ rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
+ kfree(op);
+ op = NULL;
+ break;
+ }
+ va_end(args);
- ops[0].op = opcode;
+ return op;
+}
- /*
- * op extent offset and length will be set later on
- * in calc_raw_layout()
- */
- ops[0].payload_len = payload_len;
+static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
+{
+ kfree(op);
+}
- return ops;
+static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
+ struct rbd_obj_request *obj_request)
+{
+ return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
-static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
- kfree(ops);
+ if (img_request->callback)
+ img_request->callback(img_request);
+ else
+ rbd_img_request_put(img_request);
}
-static void rbd_coll_end_req_index(struct request *rq,
- struct rbd_req_coll *coll,
- int index,
- int ret, u64 len)
+/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
+
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
- struct request_queue *q;
- int min, max, i;
+ return wait_for_completion_interruptible(&obj_request->completion);
+}
- dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
- coll, index, ret, (unsigned long long) len);
+static void obj_request_done_init(struct rbd_obj_request *obj_request)
+{
+ atomic_set(&obj_request->done, 0);
+ smp_wmb();
+}
- if (!rq)
- return;
+static void obj_request_done_set(struct rbd_obj_request *obj_request)
+{
+ atomic_set(&obj_request->done, 1);
+ smp_wmb();
+}
- if (!coll) {
- blk_end_request(rq, ret, len);
- return;
- }
+static bool obj_request_done_test(struct rbd_obj_request *obj_request)
+{
+ smp_rmb();
+ return atomic_read(&obj_request->done) != 0;
+}
+
+static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
+ struct ceph_osd_op *op)
+{
+ obj_request_done_set(obj_request);
+}
- q = rq->q;
-
- spin_lock_irq(q->queue_lock);
- coll->status[index].done = 1;
- coll->status[index].rc = ret;
- coll->status[index].bytes = len;
- max = min = coll->num_done;
- while (max < coll->total && coll->status[max].done)
- max++;
-
- for (i = min; i<max; i++) {
- __blk_end_request(rq, coll->status[i].rc,
- coll->status[i].bytes);
- coll->num_done++;
- kref_put(&coll->kref, rbd_coll_release);
+static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
+{
+ if (obj_request->callback)
+ obj_request->callback(obj_request);
+ else
+ complete_all(&obj_request->completion);
+}
+
+static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
+ struct ceph_osd_op *op)
+{
+ u64 xferred;
+
+ /*
+ * We support a 64-bit length, but ultimately it has to be
+ * passed to blk_end_request(), which takes an unsigned int.
+ */
+ xferred = le64_to_cpu(op->extent.length);
+ rbd_assert(xferred < (u64) UINT_MAX);
+ if (obj_request->result == (s32) -ENOENT) {
+ zero_bio_chain(obj_request->bio_list, 0);
+ obj_request->result = 0;
+ } else if (xferred < obj_request->length && !obj_request->result) {
+ zero_bio_chain(obj_request->bio_list, xferred);
+ xferred = obj_request->length;
}
- spin_unlock_irq(q->queue_lock);
+ obj_request->xferred = xferred;
+ obj_request_done_set(obj_request);
}
-static void rbd_coll_end_req(struct rbd_request *req,
- int ret, u64 len)
+static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
+ struct ceph_osd_op *op)
{
- rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
+ obj_request->xferred = le64_to_cpu(op->extent.length);
+ obj_request_done_set(obj_request);
}
/*
- * Send ceph osd request
+ * For a simple stat call there's nothing to do. We'll do more if
+ * this is part of a write sequence for a layered image.
*/
-static int rbd_do_request(struct request *rq,
- struct rbd_device *rbd_dev,
- struct ceph_snap_context *snapc,
- u64 snapid,
- const char *object_name, u64 ofs, u64 len,
- struct bio *bio,
- struct page **pages,
- int num_pages,
- int flags,
- struct ceph_osd_req_op *ops,
- struct rbd_req_coll *coll,
- int coll_index,
- void (*rbd_cb)(struct ceph_osd_request *req,
- struct ceph_msg *msg),
- struct ceph_osd_request **linger_req,
- u64 *ver)
-{
- struct ceph_osd_request *req;
- struct ceph_file_layout *layout;
- int ret;
- u64 bno;
- struct timespec mtime = CURRENT_TIME;
- struct rbd_request *req_data;
- struct ceph_osd_request_head *reqhead;
- struct ceph_osd_client *osdc;
+static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request,
+ struct ceph_osd_op *op)
+{
+ obj_request_done_set(obj_request);
+}
- req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
- if (!req_data) {
- if (coll)
- rbd_coll_end_req_index(rq, coll, coll_index,
- -ENOMEM, len);
- return -ENOMEM;
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
+ struct ceph_msg *msg)
+{
+ struct rbd_obj_request *obj_request = osd_req->r_priv;
+ struct ceph_osd_reply_head *reply_head;
+ struct ceph_osd_op *op;
+ u32 num_ops;
+ u16 opcode;
+
+ rbd_assert(osd_req == obj_request->osd_req);
+ rbd_assert(!!obj_request->img_request ^
+ (obj_request->which == BAD_WHICH));
+
+ obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
+ reply_head = msg->front.iov_base;
+ obj_request->result = (s32) le32_to_cpu(reply_head->result);
+ obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
+
+ num_ops = le32_to_cpu(reply_head->num_ops);
+ WARN_ON(num_ops != 1); /* For now */
+
+ op = &reply_head->ops[0];
+ opcode = le16_to_cpu(op->op);
+ switch (opcode) {
+ case CEPH_OSD_OP_READ:
+ rbd_osd_read_callback(obj_request, op);
+ break;
+ case CEPH_OSD_OP_WRITE:
+ rbd_osd_write_callback(obj_request, op);
+ break;
+ case CEPH_OSD_OP_STAT:
+ rbd_osd_stat_callback(obj_request, op);
+ break;
+ case CEPH_OSD_OP_CALL:
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_WATCH:
+ rbd_osd_trivial_callback(obj_request, op);
+ break;
+ default:
+ rbd_warn(NULL, "%s: unsupported op %hu\n",
+ obj_request->object_name, (unsigned short) opcode);
+ break;
}
- if (coll) {
- req_data->coll = coll;
- req_data->coll_index = coll_index;
+ if (obj_request_done_test(obj_request))
+ rbd_obj_request_complete(obj_request);
+}
+
+static struct ceph_osd_request *rbd_osd_req_create(
+ struct rbd_device *rbd_dev,
+ bool write_request,
+ struct rbd_obj_request *obj_request,
+ struct ceph_osd_req_op *op)
+{
+ struct rbd_img_request *img_request = obj_request->img_request;
+ struct ceph_snap_context *snapc = NULL;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_request *osd_req;
+ struct timespec now;
+ struct timespec *mtime;
+ u64 snap_id = CEPH_NOSNAP;
+ u64 offset = obj_request->offset;
+ u64 length = obj_request->length;
+
+ if (img_request) {
+ rbd_assert(img_request->write_request == write_request);
+ if (img_request->write_request)
+ snapc = img_request->snapc;
+ else
+ snap_id = img_request->snap_id;
}
- dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
- object_name, (unsigned long long) ofs,
- (unsigned long long) len, coll, coll_index);
+ /* Allocate and initialize the request, for the single op */
osdc = &rbd_dev->rbd_client->client->osdc;
- req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
- false, GFP_NOIO, pages, bio);
- if (!req) {
- ret = -ENOMEM;
- goto done_pages;
+ osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
+ if (!osd_req)
+ return NULL; /* ENOMEM */
+
+ rbd_assert(obj_request_type_valid(obj_request->type));
+ switch (obj_request->type) {
+ case OBJ_REQUEST_NODATA:
+ break; /* Nothing to do */
+ case OBJ_REQUEST_BIO:
+ rbd_assert(obj_request->bio_list != NULL);
+ osd_req->r_bio = obj_request->bio_list;
+ break;
+ case OBJ_REQUEST_PAGES:
+ osd_req->r_pages = obj_request->pages;
+ osd_req->r_num_pages = obj_request->page_count;
+ osd_req->r_page_alignment = offset & ~PAGE_MASK;
+ break;
}
- req->r_callback = rbd_cb;
+ if (write_request) {
+ osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ now = CURRENT_TIME;
+ mtime = &now;
+ } else {
+ osd_req->r_flags = CEPH_OSD_FLAG_READ;
+ mtime = NULL; /* not needed for reads */
+ offset = 0; /* These are not used... */
+ length = 0; /* ...for osd read requests */
+ }
- req_data->rq = rq;
- req_data->bio = bio;
- req_data->pages = pages;
- req_data->len = len;
+ osd_req->r_callback = rbd_osd_req_callback;
+ osd_req->r_priv = obj_request;
- req->r_priv = req_data;
+ osd_req->r_oid_len = strlen(obj_request->object_name);
+ rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
+ memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
- reqhead = req->r_request->front.iov_base;
- reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+ osd_req->r_file_layout = rbd_dev->layout; /* struct */
- strncpy(req->r_oid, object_name, sizeof(req->r_oid));
- req->r_oid_len = strlen(req->r_oid);
+ /* osd_req will get its own reference to snapc (if non-null) */
- layout = &req->r_file_layout;
- memset(layout, 0, sizeof(*layout));
- layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- layout->fl_stripe_count = cpu_to_le32(1);
- layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
- ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
- req, ops);
- rbd_assert(ret == 0);
+ ceph_osdc_build_request(osd_req, offset, length, 1, op,
+ snapc, snap_id, mtime);
- ceph_osdc_build_request(req, ofs, &len,
- ops,
- snapc,
- &mtime,
- req->r_oid, req->r_oid_len);
+ return osd_req;
+}
- if (linger_req) {
- ceph_osdc_set_request_linger(osdc, req);
- *linger_req = req;
- }
+static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
+{
+ ceph_osdc_put_request(osd_req);
+}
- ret = ceph_osdc_start_request(osdc, req, false);
- if (ret < 0)
- goto done_err;
-
- if (!rbd_cb) {
- ret = ceph_osdc_wait_request(osdc, req);
- if (ver)
- *ver = le64_to_cpu(req->r_reassert_version.version);
- dout("reassert_ver=%llu\n",
- (unsigned long long)
- le64_to_cpu(req->r_reassert_version.version));
- ceph_osdc_put_request(req);
+/* object_name is assumed to be a non-null pointer and NUL-terminated */
+
+static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
+ u64 offset, u64 length,
+ enum obj_request_type type)
+{
+ struct rbd_obj_request *obj_request;
+ size_t size;
+ char *name;
+
+ rbd_assert(obj_request_type_valid(type));
+
+ size = strlen(object_name) + 1;
+ obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
+ if (!obj_request)
+ return NULL;
+
+ name = (char *)(obj_request + 1);
+ obj_request->object_name = memcpy(name, object_name, size);
+ obj_request->offset = offset;
+ obj_request->length = length;
+ obj_request->which = BAD_WHICH;
+ obj_request->type = type;
+ INIT_LIST_HEAD(&obj_request->links);
+ obj_request_done_init(obj_request);
+ init_completion(&obj_request->completion);
+ kref_init(&obj_request->kref);
+
+ return obj_request;
+}
+
+static void rbd_obj_request_destroy(struct kref *kref)
+{
+ struct rbd_obj_request *obj_request;
+
+ obj_request = container_of(kref, struct rbd_obj_request, kref);
+
+ rbd_assert(obj_request->img_request == NULL);
+ rbd_assert(obj_request->which == BAD_WHICH);
+
+ if (obj_request->osd_req)
+ rbd_osd_req_destroy(obj_request->osd_req);
+
+ rbd_assert(obj_request_type_valid(obj_request->type));
+ switch (obj_request->type) {
+ case OBJ_REQUEST_NODATA:
+ break; /* Nothing to do */
+ case OBJ_REQUEST_BIO:
+ if (obj_request->bio_list)
+ bio_chain_put(obj_request->bio_list);
+ break;
+ case OBJ_REQUEST_PAGES:
+ if (obj_request->pages)
+ ceph_release_page_vector(obj_request->pages,
+ obj_request->page_count);
+ break;
}
- return ret;
-done_err:
- bio_chain_put(req_data->bio);
- ceph_osdc_put_request(req);
-done_pages:
- rbd_coll_end_req(req_data, ret, len);
- kfree(req_data);
- return ret;
+ kfree(obj_request);
}
/*
- * Ceph osd op callback
+ * Caller is responsible for filling in the list of object requests
+ * that comprises the image request, and the Linux request pointer
+ * (if there is one).
*/
-static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
+ u64 offset, u64 length,
+ bool write_request)
{
- struct rbd_request *req_data = req->r_priv;
- struct ceph_osd_reply_head *replyhead;
- struct ceph_osd_op *op;
- __s32 rc;
- u64 bytes;
- int read_op;
-
- /* parse reply */
- replyhead = msg->front.iov_base;
- WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
- op = (void *)(replyhead + 1);
- rc = le32_to_cpu(replyhead->result);
- bytes = le64_to_cpu(op->extent.length);
- read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
-
- dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
- (unsigned long long) bytes, read_op, (int) rc);
-
- if (rc == -ENOENT && read_op) {
- zero_bio_chain(req_data->bio, 0);
- rc = 0;
- } else if (rc == 0 && read_op && bytes < req_data->len) {
- zero_bio_chain(req_data->bio, bytes);
- bytes = req_data->len;
- }
+ struct rbd_img_request *img_request;
+ struct ceph_snap_context *snapc = NULL;
- rbd_coll_end_req(req_data, rc, bytes);
+ img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
+ if (!img_request)
+ return NULL;
- if (req_data->bio)
- bio_chain_put(req_data->bio);
+ if (write_request) {
+ down_read(&rbd_dev->header_rwsem);
+ snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+ up_read(&rbd_dev->header_rwsem);
+ if (WARN_ON(!snapc)) {
+ kfree(img_request);
+ return NULL; /* Shouldn't happen */
+ }
+ }
- ceph_osdc_put_request(req);
- kfree(req_data);
+ img_request->rq = NULL;
+ img_request->rbd_dev = rbd_dev;
+ img_request->offset = offset;
+ img_request->length = length;
+ img_request->write_request = write_request;
+ if (write_request)
+ img_request->snapc = snapc;
+ else
+ img_request->snap_id = rbd_dev->spec->snap_id;
+ spin_lock_init(&img_request->completion_lock);
+ img_request->next_completion = 0;
+ img_request->callback = NULL;
+ img_request->obj_request_count = 0;
+ INIT_LIST_HEAD(&img_request->obj_requests);
+ kref_init(&img_request->kref);
+
+ rbd_img_request_get(img_request); /* Avoid a warning */
+ rbd_img_request_put(img_request); /* TEMPORARY */
+
+ return img_request;
}
-static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void rbd_img_request_destroy(struct kref *kref)
{
- ceph_osdc_put_request(req);
+ struct rbd_img_request *img_request;
+ struct rbd_obj_request *obj_request;
+ struct rbd_obj_request *next_obj_request;
+
+ img_request = container_of(kref, struct rbd_img_request, kref);
+
+ for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+ rbd_img_obj_request_del(img_request, obj_request);
+ rbd_assert(img_request->obj_request_count == 0);
+
+ if (img_request->write_request)
+ ceph_put_snap_context(img_request->snapc);
+
+ kfree(img_request);
}
-/*
- * Do a synchronous ceph osd operation
- */
-static int rbd_req_sync_op(struct rbd_device *rbd_dev,
- struct ceph_snap_context *snapc,
- u64 snapid,
- int flags,
- struct ceph_osd_req_op *ops,
- const char *object_name,
- u64 ofs, u64 inbound_size,
- char *inbound,
- struct ceph_osd_request **linger_req,
- u64 *ver)
+static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
+ struct bio *bio_list)
{
- int ret;
- struct page **pages;
- int num_pages;
-
- rbd_assert(ops != NULL);
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ struct rbd_obj_request *obj_request = NULL;
+ struct rbd_obj_request *next_obj_request;
+ unsigned int bio_offset;
+ u64 image_offset;
+ u64 resid;
+ u16 opcode;
+
+ opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
+ : CEPH_OSD_OP_READ;
+ bio_offset = 0;
+ image_offset = img_request->offset;
+ rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
+ resid = img_request->length;
+ while (resid) {
+ const char *object_name;
+ unsigned int clone_size;
+ struct ceph_osd_req_op *op;
+ u64 offset;
+ u64 length;
+
+ object_name = rbd_segment_name(rbd_dev, image_offset);
+ if (!object_name)
+ goto out_unwind;
+ offset = rbd_segment_offset(rbd_dev, image_offset);
+ length = rbd_segment_length(rbd_dev, image_offset, resid);
+ obj_request = rbd_obj_request_create(object_name,
+ offset, length,
+ OBJ_REQUEST_BIO);
+ kfree(object_name); /* object request has its own copy */
+ if (!obj_request)
+ goto out_unwind;
+
+ rbd_assert(length <= (u64) UINT_MAX);
+ clone_size = (unsigned int) length;
+ obj_request->bio_list = bio_chain_clone_range(&bio_list,
+ &bio_offset, clone_size,
+ GFP_ATOMIC);
+ if (!obj_request->bio_list)
+ goto out_partial;
- num_pages = calc_pages_for(ofs, inbound_size);
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
+ /*
+ * Build up the op to use in building the osd
+ * request. Note that the contents of the op are
+ * copied by rbd_osd_req_create().
+ */
+ op = rbd_osd_req_op_create(opcode, offset, length);
+ if (!op)
+ goto out_partial;
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev,
+ img_request->write_request,
+ obj_request, op);
+ rbd_osd_req_op_destroy(op);
+ if (!obj_request->osd_req)
+ goto out_partial;
+ /* status and version are initially zero-filled */
+
+ rbd_img_obj_request_add(img_request, obj_request);
+
+ image_offset += length;
+ resid -= length;
+ }
- ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,<