summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-bus-rbd4
-rw-r--r--drivers/block/rbd.c1389
-rw-r--r--drivers/block/rbd_types.h2
-rw-r--r--fs/ceph/addr.c60
-rw-r--r--fs/ceph/caps.c18
-rw-r--r--fs/ceph/file.c73
-rw-r--r--fs/ceph/inode.c15
-rw-r--r--fs/ceph/mds_client.c11
-rw-r--r--fs/ceph/super.c4
-rw-r--r--include/linux/backing-dev.h1
-rw-r--r--include/linux/ceph/libceph.h2
-rw-r--r--include/linux/ceph/osdmap.h1
-rw-r--r--include/linux/ceph/rados.h2
-rw-r--r--net/ceph/ceph_common.c3
-rw-r--r--net/ceph/messenger.c107
-rw-r--r--net/ceph/osd_client.c59
-rw-r--r--net/ceph/osdmap.c47
17 files changed, 1192 insertions, 606 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd
index 1cf2adf46b11..cd9213ccf3dc 100644
--- a/Documentation/ABI/testing/sysfs-bus-rbd
+++ b/Documentation/ABI/testing/sysfs-bus-rbd
@@ -70,6 +70,10 @@ snap_*
A directory per each snapshot
+parent
+
+ Information identifying the pool, image, and snapshot id for
+ the parent image in a layered rbd image (format 2 only).
Entries under /sys/bus/rbd/devices/<dev-id>/snap_<snap-name>
-------------------------------------------------------------
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bb3d9be3b1b4..89576a0b3f2e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -61,15 +61,29 @@
#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
-#define RBD_MAX_SNAP_NAME_LEN 32
+#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
+#define RBD_MAX_SNAP_NAME_LEN \
+ (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
+
#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN 1024
#define RBD_SNAP_HEAD_NAME "-"
+/* This allows a single page to hold an image name sent by OSD */
+#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX 64
+
#define RBD_OBJ_PREFIX_LEN_MAX 64
+/* Feature bits */
+
+#define RBD_FEATURE_LAYERING 1
+
+/* Features supported by this (client software) implementation. */
+
+#define RBD_FEATURES_ALL (0)
+
/*
* An RBD device name will be "rbd#", where the "rbd" comes from
* RBD_DRV_NAME above, and # is a unique integer identifier.
@@ -101,6 +115,27 @@ struct rbd_image_header {
u64 obj_version;
};
+/*
+ * An rbd image specification.
+ *
+ * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
+ * identify an image.
+ */
+struct rbd_spec {
+ u64 pool_id;
+ char *pool_name;
+
+ char *image_id;
+ size_t image_id_len;
+ char *image_name;
+ size_t image_name_len;
+
+ u64 snap_id;
+ char *snap_name;
+
+ struct kref kref;
+};
+
struct rbd_options {
bool read_only;
};
@@ -155,11 +190,8 @@ struct rbd_snap {
};
struct rbd_mapping {
- char *snap_name;
- u64 snap_id;
u64 size;
u64 features;
- bool snap_exists;
bool read_only;
};
@@ -173,7 +205,6 @@ struct rbd_device {
struct gendisk *disk; /* blkdev's gendisk and rq */
u32 image_format; /* Either 1 or 2 */
- struct rbd_options rbd_opts;
struct rbd_client *rbd_client;
char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
@@ -181,17 +212,17 @@ struct rbd_device {
spinlock_t lock; /* queue lock */
struct rbd_image_header header;
- char *image_id;
- size_t image_id_len;
- char *image_name;
- size_t image_name_len;
+ bool exists;
+ struct rbd_spec *spec;
+
char *header_name;
- char *pool_name;
- int pool_id;
struct ceph_osd_event *watch_event;
struct ceph_osd_request *watch_request;
+ struct rbd_spec *parent_spec;
+ u64 parent_overlap;
+
/* protects updating the header */
struct rw_semaphore header_rwsem;
@@ -204,6 +235,7 @@ struct rbd_device {
/* sysfs related */
struct device dev;
+ unsigned long open_count;
};
static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
@@ -218,7 +250,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
-static void __rbd_remove_snap_dev(struct rbd_snap *snap);
+static void rbd_remove_snap_dev(struct rbd_snap *snap);
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
size_t count);
@@ -258,17 +290,8 @@ static struct device rbd_root_dev = {
# define rbd_assert(expr) ((void) 0)
#endif /* !RBD_DEBUG */
-static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
-{
- return get_device(&rbd_dev->dev);
-}
-
-static void rbd_put_dev(struct rbd_device *rbd_dev)
-{
- put_device(&rbd_dev->dev);
-}
-
-static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
+static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
@@ -277,8 +300,11 @@ static int rbd_open(struct block_device *bdev, fmode_t mode)
if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
return -EROFS;
- rbd_get_dev(rbd_dev);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ (void) get_device(&rbd_dev->dev);
set_device_ro(bdev, rbd_dev->mapping.read_only);
+ rbd_dev->open_count++;
+ mutex_unlock(&ctl_mutex);
return 0;
}
@@ -287,7 +313,11 @@ static int rbd_release(struct gendisk *disk, fmode_t mode)
{
struct rbd_device *rbd_dev = disk->private_data;
- rbd_put_dev(rbd_dev);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ rbd_assert(rbd_dev->open_count > 0);
+ rbd_dev->open_count--;
+ put_device(&rbd_dev->dev);
+ mutex_unlock(&ctl_mutex);
return 0;
}
@@ -388,7 +418,7 @@ enum {
static match_table_t rbd_opts_tokens = {
/* int args above */
/* string args above */
- {Opt_read_only, "mapping.read_only"},
+ {Opt_read_only, "read_only"},
{Opt_read_only, "ro"}, /* Alternate spelling */
{Opt_read_write, "read_write"},
{Opt_read_write, "rw"}, /* Alternate spelling */
@@ -441,33 +471,17 @@ static int parse_rbd_opts_token(char *c, void *private)
* Get a ceph client with specific addr and configuration, if one does
* not exist create it.
*/
-static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
- size_t mon_addr_len, char *options)
+static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
- struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
- struct ceph_options *ceph_opts;
struct rbd_client *rbdc;
- rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
-
- ceph_opts = ceph_parse_options(options, mon_addr,
- mon_addr + mon_addr_len,
- parse_rbd_opts_token, rbd_opts);
- if (IS_ERR(ceph_opts))
- return PTR_ERR(ceph_opts);
-
rbdc = rbd_client_find(ceph_opts);
- if (rbdc) {
- /* using an existing client */
+ if (rbdc) /* using an existing client */
ceph_destroy_options(ceph_opts);
- } else {
+ else
rbdc = rbd_client_create(ceph_opts);
- if (IS_ERR(rbdc))
- return PTR_ERR(rbdc);
- }
- rbd_dev->rbd_client = rbdc;
- return 0;
+ return rbdc;
}
/*
@@ -492,10 +506,10 @@ static void rbd_client_release(struct kref *kref)
* Drop reference to ceph client node. If it's not referenced anymore, release
* it.
*/
-static void rbd_put_client(struct rbd_device *rbd_dev)
+static void rbd_put_client(struct rbd_client *rbdc)
{
- kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
- rbd_dev->rbd_client = NULL;
+ if (rbdc)
+ kref_put(&rbdc->kref, rbd_client_release);
}
/*
@@ -524,6 +538,16 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
return false;
+ /* The bio layer requires at least sector-sized I/O */
+
+ if (ondisk->options.order < SECTOR_SHIFT)
+ return false;
+
+ /* If we use u64 in a few spots we may be able to loosen this */
+
+ if (ondisk->options.order > 8 * sizeof (int) - 1)
+ return false;
+
/*
* The size of a snapshot header has to fit in a size_t, and
* that limits the number of snapshots.
@@ -635,6 +659,20 @@ out_err:
return -ENOMEM;
}
+static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
+{
+ struct rbd_snap *snap;
+
+ if (snap_id == CEPH_NOSNAP)
+ return RBD_SNAP_HEAD_NAME;
+
+ list_for_each_entry(snap, &rbd_dev->snaps, node)
+ if (snap_id == snap->id)
+ return snap->name;
+
+ return NULL;
+}
+
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{
@@ -642,7 +680,7 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
list_for_each_entry(snap, &rbd_dev->snaps, node) {
if (!strcmp(snap_name, snap->name)) {
- rbd_dev->mapping.snap_id = snap->id;
+ rbd_dev->spec->snap_id = snap->id;
rbd_dev->mapping.size = snap->size;
rbd_dev->mapping.features = snap->features;
@@ -653,26 +691,23 @@ static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
return -ENOENT;
}
-static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
+static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
int ret;
- if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
+ if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
sizeof (RBD_SNAP_HEAD_NAME))) {
- rbd_dev->mapping.snap_id = CEPH_NOSNAP;
+ rbd_dev->spec->snap_id = CEPH_NOSNAP;
rbd_dev->mapping.size = rbd_dev->header.image_size;
rbd_dev->mapping.features = rbd_dev->header.features;
- rbd_dev->mapping.snap_exists = false;
- rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
ret = 0;
} else {
- ret = snap_by_name(rbd_dev, snap_name);
+ ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
if (ret < 0)
goto done;
- rbd_dev->mapping.snap_exists = true;
rbd_dev->mapping.read_only = true;
}
- rbd_dev->mapping.snap_name = snap_name;
+ rbd_dev->exists = true;
done:
return ret;
}
@@ -695,13 +730,13 @@ static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
u64 segment;
int ret;
- name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
if (!name)
return NULL;
segment = offset >> rbd_dev->header.obj_order;
- ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
+ ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
rbd_dev->header.object_prefix, segment);
- if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
+ if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
pr_err("error formatting segment name for #%llu (%d)\n",
segment, ret);
kfree(name);
@@ -800,77 +835,144 @@ static void zero_bio_chain(struct bio *chain, int start_ofs)
}
/*
- * bio_chain_clone - clone a chain of bios up to a certain length.
- * might return a bio_pair that will need to be released.
+ * Clone a portion of a bio, starting at the given byte offset
+ * and continuing for the number of bytes indicated.
*/
-static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
- struct bio_pair **bp,
- int len, gfp_t gfpmask)
-{
- struct bio *old_chain = *old;
- struct bio *new_chain = NULL;
- struct bio *tail;
- int total = 0;
-
- if (*bp) {
- bio_pair_release(*bp);
- *bp = NULL;
- }
+static struct bio *bio_clone_range(struct bio *bio_src,
+ unsigned int offset,
+ unsigned int len,
+ gfp_t gfpmask)
+{
+ struct bio_vec *bv;
+ unsigned int resid;
+ unsigned short idx;
+ unsigned int voff;
+ unsigned short end_idx;
+ unsigned short vcnt;
+ struct bio *bio;
- while (old_chain && (total < len)) {
- struct bio *tmp;
+ /* Handle the easy case for the caller */
- tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
- if (!tmp)
- goto err_out;
- gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
+ if (!offset && len == bio_src->bi_size)
+ return bio_clone(bio_src, gfpmask);
- if (total + old_chain->bi_size > len) {
- struct bio_pair *bp;
+ if (WARN_ON_ONCE(!len))
+ return NULL;
+ if (WARN_ON_ONCE(len > bio_src->bi_size))
+ return NULL;
+ if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
+ return NULL;
- /*
- * this split can only happen with a single paged bio,
- * split_bio will BUG_ON if this is not the case
- */
- dout("bio_chain_clone split! total=%d remaining=%d"
- "bi_size=%u\n",
- total, len - total, old_chain->bi_size);
+ /* Find first affected segment... */
- /* split the bio. We'll release it either in the next
- call, or it will have to be released outside */
- bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
- if (!bp)
- goto err_out;
+ resid = offset;
+ __bio_for_each_segment(bv, bio_src, idx, 0) {
+ if (resid < bv->bv_len)
+ break;
+ resid -= bv->bv_len;
+ }
+ voff = resid;
- __bio_clone(tmp, &bp->bio1);
+ /* ...and the last affected segment */
- *next = &bp->bio2;
- } else {
- __bio_clone(tmp, old_chain);
- *next = old_chain->bi_next;
- }
+ resid += len;
+ __bio_for_each_segment(bv, bio_src, end_idx, idx) {
+ if (resid <= bv->bv_len)
+ break;
+ resid -= bv->bv_len;
+ }
+ vcnt = end_idx - idx + 1;
+
+ /* Build the clone */
- tmp->bi_bdev = NULL;
- tmp->bi_next = NULL;
- if (new_chain)
- tail->bi_next = tmp;
- else
- new_chain = tmp;
- tail = tmp;
- old_chain = old_chain->bi_next;
+ bio = bio_alloc(gfpmask, (unsigned int) vcnt);
+ if (!bio)
+ return NULL; /* ENOMEM */
- total += tmp->bi_size;
+ bio->bi_bdev = bio_src->bi_bdev;
+ bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
+ bio->bi_rw = bio_src->bi_rw;
+ bio->bi_flags |= 1 << BIO_CLONED;
+
+ /*
+ * Copy over our part of the bio_vec, then update the first
+ * and last (or only) entries.
+ */
+ memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
+ vcnt * sizeof (struct bio_vec));
+ bio->bi_io_vec[0].bv_offset += voff;
+ if (vcnt > 1) {
+ bio->bi_io_vec[0].bv_len -= voff;
+ bio->bi_io_vec[vcnt - 1].bv_len = resid;
+ } else {
+ bio->bi_io_vec[0].bv_len = len;
}
- rbd_assert(total == len);
+ bio->bi_vcnt = vcnt;
+ bio->bi_size = len;
+ bio->bi_idx = 0;
+
+ return bio;
+}
+
+/*
+ * Clone a portion of a bio chain, starting at the given byte offset
+ * into the first bio in the source chain and continuing for the
+ * number of bytes indicated. The result is another bio chain of
+ * exactly the given length, or a null pointer on error.
+ *
+ * The bio_src and offset parameters are both in-out. On entry they
+ * refer to the first source bio and the offset into that bio where
+ * the start of data to be cloned is located.
+ *
+ * On return, bio_src is updated to refer to the bio in the source
+ * chain that contains first un-cloned byte, and *offset will
+ * contain the offset of that byte within that bio.
+ */
+static struct bio *bio_chain_clone_range(struct bio **bio_src,
+ unsigned int *offset,
+ unsigned int len,
+ gfp_t gfpmask)
+{
+ struct bio *bi = *bio_src;
+ unsigned int off = *offset;
+ struct bio *chain = NULL;
+ struct bio **end;
+
+ /* Build up a chain of clone bios up to the limit */
+
+ if (!bi || off >= bi->bi_size || !len)
+ return NULL; /* Nothing to clone */
- *old = old_chain;
+ end = &chain;
+ while (len) {
+ unsigned int bi_size;
+ struct bio *bio;
+
+ if (!bi)
+ goto out_err; /* EINVAL; ran out of bio's */
+ bi_size = min_t(unsigned int, bi->bi_size - off, len);
+ bio = bio_clone_range(bi, off, bi_size, gfpmask);
+ if (!bio)
+ goto out_err; /* ENOMEM */
+
+ *end = bio;
+ end = &bio->bi_next;
+
+ off += bi_size;
+ if (off == bi->bi_size) {
+ bi = bi->bi_next;
+ off = 0;
+ }
+ len -= bi_size;
+ }
+ *bio_src = bi;
+ *offset = off;
- return new_chain;
+ return chain;
+out_err:
+ bio_chain_put(chain);
-err_out:
- dout("bio_chain_clone with err\n");
- bio_chain_put(new_chain);
return NULL;
}
@@ -988,8 +1090,9 @@ static int rbd_do_request(struct request *rq,
req_data->coll_index = coll_index;
}
- dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
- (unsigned long long) ofs, (unsigned long long) len);
+ dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
+ object_name, (unsigned long long) ofs,
+ (unsigned long long) len, coll, coll_index);
osdc = &rbd_dev->rbd_client->client->osdc;
req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
@@ -1019,7 +1122,7 @@ static int rbd_do_request(struct request *rq,
layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
layout->fl_stripe_count = cpu_to_le32(1);
layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
+ layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
req, ops);
rbd_assert(ret == 0);
@@ -1154,8 +1257,6 @@ done:
static int rbd_do_op(struct request *rq,
struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc,
- u64 snapid,
- int opcode, int flags,
u64 ofs, u64 len,
struct bio *bio,
struct rbd_req_coll *coll,
@@ -1167,6 +1268,9 @@ static int rbd_do_op(struct request *rq,
int ret;
struct ceph_osd_req_op *ops;
u32 payload_len;
+ int opcode;
+ int flags;
+ u64 snapid;
seg_name = rbd_segment_name(rbd_dev, ofs);
if (!seg_name)
@@ -1174,7 +1278,18 @@ static int rbd_do_op(struct request *rq,
seg_len = rbd_segment_length(rbd_dev, ofs, len);
seg_ofs = rbd_segment_offset(rbd_dev, ofs);
- payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+ if (rq_data_dir(rq) == WRITE) {
+ opcode = CEPH_OSD_OP_WRITE;
+ flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
+ snapid = CEPH_NOSNAP;
+ payload_len = seg_len;
+ } else {
+ opcode = CEPH_OSD_OP_READ;
+ flags = CEPH_OSD_FLAG_READ;
+ snapc = NULL;
+ snapid = rbd_dev->spec->snap_id;
+ payload_len = 0;
+ }
ret = -ENOMEM;
ops = rbd_create_rw_ops(1, opcode, payload_len);
@@ -1202,41 +1317,6 @@ done:
}
/*
- * Request async osd write
- */
-static int rbd_req_write(struct request *rq,
- struct rbd_device *rbd_dev,
- struct ceph_snap_context *snapc,
- u64 ofs, u64 len,
- struct bio *bio,
- struct rbd_req_coll *coll,
- int coll_index)
-{
- return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- ofs, len, bio, coll, coll_index);
-}
-
-/*
- * Request async osd read
- */
-static int rbd_req_read(struct request *rq,
- struct rbd_device *rbd_dev,
- u64 snapid,
- u64 ofs, u64 len,
- struct bio *bio,
- struct rbd_req_coll *coll,
- int coll_index)
-{
- return rbd_do_op(rq, rbd_dev, NULL,
- snapid,
- CEPH_OSD_OP_READ,
- CEPH_OSD_FLAG_READ,
- ofs, len, bio, coll, coll_index);
-}
-
-/*
* Request sync osd read
*/
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
@@ -1304,7 +1384,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
rbd_dev->header_name, (unsigned long long) notify_id,
(unsigned int) opcode);
- rc = rbd_refresh_header(rbd_dev, &hver);
+ rc = rbd_dev_refresh(rbd_dev, &hver);
if (rc)
pr_warning(RBD_DRV_NAME "%d got notification but failed to "
" update snaps: %d\n", rbd_dev->major, rc);
@@ -1460,18 +1540,16 @@ static void rbd_rq_fn(struct request_queue *q)
{
struct rbd_device *rbd_dev = q->queuedata;
struct request *rq;
- struct bio_pair *bp = NULL;
while ((rq = blk_fetch_request(q))) {
struct bio *bio;
- struct bio *rq_bio, *next_bio = NULL;
bool do_write;
unsigned int size;
- u64 op_size = 0;
u64 ofs;
int num_segs, cur_seg = 0;
struct rbd_req_coll *coll;
struct ceph_snap_context *snapc;
+ unsigned int bio_offset;
dout("fetched request\n");
@@ -1483,10 +1561,6 @@ static void rbd_rq_fn(struct request_queue *q)
/* deduce our operation (read, write) */
do_write = (rq_data_dir(rq) == WRITE);
-
- size = blk_rq_bytes(rq);
- ofs = blk_rq_pos(rq) * SECTOR_SIZE;
- rq_bio = rq->bio;
if (do_write && rbd_dev->mapping.read_only) {
__blk_end_request_all(rq, -EROFS);
continue;
@@ -1496,8 +1570,8 @@ static void rbd_rq_fn(struct request_queue *q)
down_read(&rbd_dev->header_rwsem);
- if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
- !rbd_dev->mapping.snap_exists) {
+ if (!rbd_dev->exists) {
+ rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
up_read(&rbd_dev->header_rwsem);
dout("request for non-existent snapshot");
spin_lock_irq(q->queue_lock);
@@ -1509,6 +1583,10 @@ static void rbd_rq_fn(struct request_queue *q)
up_read(&rbd_dev->header_rwsem);
+ size = blk_rq_bytes(rq);
+ ofs = blk_rq_pos(rq) * SECTOR_SIZE;
+ bio = rq->bio;
+
dout("%s 0x%x bytes at 0x%llx\n",
do_write ? "write" : "read",
size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
@@ -1528,45 +1606,37 @@ static void rbd_rq_fn(struct request_queue *q)
continue;
}
+ bio_offset = 0;
do {
- /* a bio clone to be passed down to OSD req */
+ u64 limit = rbd_segment_length(rbd_dev, ofs, size);
+ unsigned int chain_size;
+ struct bio *bio_chain;
+
+ BUG_ON(limit > (u64) UINT_MAX);
+ chain_size = (unsigned int) limit;
dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
- op_size = rbd_segment_length(rbd_dev, ofs, size);
+
kref_get(&coll->kref);
- bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
- op_size, GFP_ATOMIC);
- if (!bio) {
- rbd_coll_end_req_index(rq, coll, cur_seg,
- -ENOMEM, op_size);
- goto next_seg;
- }
+ /* Pass a cloned bio chain via an osd request */
- /* init OSD command: write or read */
- if (do_write)
- rbd_req_write(rq, rbd_dev,
- snapc,
- ofs,
- op_size, bio,
- coll, cur_seg);
+ bio_chain = bio_chain_clone_range(&bio,
+ &bio_offset, chain_size,
+ GFP_ATOMIC);
+ if (bio_chain)
+ (void) rbd_do_op(rq, rbd_dev, snapc,
+ ofs, chain_size,
+ bio_chain, coll, cur_seg);
else
- rbd_req_read(rq, rbd_dev,
- rbd_dev->mapping.snap_id,
- ofs,
- op_size, bio,
- coll, cur_seg);
-
-next_seg:
- size -= op_size;
- ofs += op_size;
+ rbd_coll_end_req_index(rq, coll, cur_seg,
+ -ENOMEM, chain_size);
+ size -= chain_size;
+ ofs += chain_size;
cur_seg++;
- rq_bio = next_bio;
} while (size > 0);
kref_put(&coll->kref, rbd_coll_release);
- if (bp)
- bio_pair_release(bp);
spin_lock_irq(q->queue_lock);
ceph_put_snap_context(snapc);
@@ -1576,28 +1646,47 @@ next_seg:
/*
* a queue callback. Makes sure that we don't create a bio that spans across
* multiple osd objects. One exception would be with a single page bios,
- * which we handle later at bio_chain_clone
+ * which we handle later at bio_chain_clone_range()
*/
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
struct bio_vec *bvec)
{
struct rbd_device *rbd_dev = q->queuedata;
- unsigned int chunk_sectors;
- sector_t sector;
- unsigned int bio_sectors;
- int max;
+ sector_t sector_offset;
+ sector_t sectors_per_obj;
+ sector_t obj_sector_offset;
+ int ret;
- chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
- sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
- bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
+ /*
+ * Find how far into its rbd object the partition-relative
+ * bio start sector is to offset relative to the enclosing
+ * device.
+ */
+ sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
+ sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
+ obj_sector_offset = sector_offset & (sectors_per_obj - 1);
+
+ /*
+ * Compute the number of bytes from that offset to the end
+ * of the object. Account for what's already used by the bio.
+ */
+ ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
+ if (ret > bmd->bi_size)
+ ret -= bmd->bi_size;
+ else
+ ret = 0;
- max = (chunk_sectors - ((sector & (chunk_sectors - 1))
- + bio_sectors)) << SECTOR_SHIFT;
- if (max < 0)
- max = 0; /* bio_add cannot handle a negative return */
- if (max <= bvec->bv_len && bio_sectors == 0)
- return bvec->bv_len;
- return max;
+ /*
+ * Don't send back more than was asked for. And if the bio
+ * was empty, let the whole thing through because: "Note
+ * that a block device *must* allow a single page to be
+ * added to an empty bio."
+ */
+ rbd_assert(bvec->bv_len <= PAGE_SIZE);
+ if (ret > (int) bvec->bv_len || !bmd->bi_size)
+ ret = (int) bvec->bv_len;
+
+ return ret;
}
static void rbd_free_disk(struct rbd_device *rbd_dev)
@@ -1663,13 +1752,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
ret = -ENXIO;
pr_warning("short header read for image %s"
" (want %zd got %d)\n",
- rbd_dev->image_name, size, ret);
+ rbd_dev->spec->image_name, size, ret);
goto out_err;
}
if (!rbd_dev_ondisk_valid(ondisk)) {
ret = -ENXIO;
pr_warning("invalid header for image %s\n",
- rbd_dev->image_name);
+ rbd_dev->spec->image_name);
goto out_err;
}
@@ -1707,19 +1796,32 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
return ret;
}
-static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
+static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
struct rbd_snap *snap;
struct rbd_snap *next;
list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
- __rbd_remove_snap_dev(snap);
+ rbd_remove_snap_dev(snap);
+}
+
+static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
+{
+ sector_t size;
+
+ if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
+ return;
+
+ size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
+ dout("setting size to %llu sectors", (unsigned long long) size);
+ rbd_dev->mapping.size = (u64) size;
+ set_capacity(rbd_dev->disk, size);
}
/*
* only read the first part of the ondisk header, without the snaps info
*/
-static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
int ret;
struct rbd_image_header h;
@@ -1730,17 +1832,9 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
down_write(&rbd_dev->header_rwsem);
- /* resized? */
- if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
- sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
-
- if (size != (sector_t) rbd_dev->mapping.size) {
- dout("setting size to %llu sectors",
- (unsigned long long) size);
- rbd_dev->mapping.size = (u64) size;
- set_capacity(rbd_dev->disk, size);
- }
- }
+ /* Update image size, and check for resize of mapped image */
+ rbd_dev->header.image_size = h.image_size;
+ rbd_update_mapping_size(rbd_dev);
/* rbd_dev->header.object_prefix shouldn't change */
kfree(rbd_dev->header.snap_sizes);
@@ -1768,12 +1862,16 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
return ret;
}
-static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
+static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
int ret;
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- ret = __rbd_refresh_header(rbd_dev, hver);
+ if (rbd_dev->image_format == 1)
+ ret = rbd_dev_v1_refresh(rbd_dev, hver);
+ else
+ ret = rbd_dev_v2_refresh(rbd_dev, hver);
mutex_unlock(&ctl_mutex);
return ret;
@@ -1885,7 +1983,7 @@ static ssize_t rbd_pool_show(struct device *dev,
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- return sprintf(buf, "%s\n", rbd_dev->pool_name);
+ return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}
static ssize_t rbd_pool_id_show(struct device *dev,
@@ -1893,7 +1991,8 @@ static ssize_t rbd_pool_id_show(struct device *dev,
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- return sprintf(buf, "%d\n", rbd_dev->pool_id);
+ return sprintf(buf, "%llu\n",
+ (unsigned long long) rbd_dev->spec->pool_id);
}
static ssize_t rbd_name_show(struct device *dev,
@@ -1901,7 +2000,10 @@ static ssize_t rbd_name_show(struct device *dev,
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- return sprintf(buf, "%s\n", rbd_dev->image_name);
+ if (rbd_dev->spec->image_name)
+ return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
+
+ return sprintf(buf, "(unknown)\n");
}
static ssize_t rbd_image_id_show(struct device *dev,
@@ -1909,7 +2011,7 @@ static ssize_t rbd_image_id_show(struct device *dev,
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- return sprintf(buf, "%s\n", rbd_dev->image_id);
+ return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}
/*
@@ -1922,7 +2024,50 @@ static ssize_t rbd_snap_show(struct device *dev,
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
- return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
+ return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
+}
+
+/*
+ * For an rbd v2 image, shows the pool id, image id, and snapshot id
+ * for the parent image. If there is no parent, simply shows
+ * "(no parent image)".
+ */
+static ssize_t rbd_parent_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+ struct rbd_spec *spec = rbd_dev->parent_spec;
+ int count;
+ char *bufp = buf;
+
+ if (!spec)
+ return sprintf(buf, "(no parent image)\n");
+
+ count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
+ (unsigned long long) spec->pool_id, spec->pool_name);
+ if (count < 0)
+ return count;
+ bufp += count;
+
+ count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
+ spec->image_name ? spec->image_name : "(unknown)");
+ if (count < 0)
+ return count;
+ bufp += count;
+
+ count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
+ (unsigned long long) spec->snap_id, spec->snap_name);
+ if (count < 0)
+ return count;
+ bufp += count;
+
+ count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
+ if (count < 0)
+ return count;
+ bufp += count;
+
+ return (ssize_t) (bufp - buf);
}
static ssize_t rbd_image_refresh(struct device *dev,
@@ -1933,7 +2078,7 @@ static ssize_t rbd_image_refresh(struct device *dev,
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
int ret;
- ret = rbd_refresh_header(rbd_dev, NULL);
+ ret = rbd_dev_refresh(rbd_dev, NULL);
return ret < 0 ? ret : size;
}
@@ -1948,6 +2093,7 @@ static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
+static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
static struct attribute *rbd_attrs[] = {
&dev_attr_size.attr,
@@ -1959,6 +2105,7 @@ static struct attribute *rbd_attrs[] = {
&dev_attr_name.attr,
&dev_attr_image_id.attr,