From 3a25cf43e00842ce51f7ce48ea5e38e516b574a8 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Thu, 1 Jan 2015 17:58:32 +0100 Subject: rbd: nuke copy_token() It's been largely superseded by dup_token() and unused for over 2 years, identified by cppcheck. Signed-off-by: Rickard Strandqvist [idryomov@redhat.com: changelog] Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8a86b62466f7..ef070d7c5e3c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4770,36 +4770,6 @@ static inline size_t next_token(const char **buf) return strcspn(*buf, spaces); /* Return token length */ } -/* - * Finds the next token in *buf, and if the provided token buffer is - * big enough, copies the found token into it. The result, if - * copied, is guaranteed to be terminated with '\0'. Note that *buf - * must be terminated with '\0' on entry. - * - * Returns the length of the token found (not including the '\0'). - * Return value will be 0 if no token is found, and it will be >= - * token_size if the token would not fit. - * - * The *buf pointer will be updated to point beyond the end of the - * found token. Note that this occurs even if the token buffer is - * too small to hold it. - */ -static inline size_t copy_token(const char **buf, - char *token, - size_t token_size) -{ - size_t len; - - len = next_token(buf); - if (len < token_size) { - memcpy(token, *buf, len); - *(token + len) = '\0'; - } - *buf += len; - - return len; -} - /* * Finds the next token in *buf, dynamically allocates a buffer big * enough to hold a copy of it, and copies the token into the new -- cgit v1.2.3 From 73e39e4dba828fe1affefe6290456623319707bd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 8 Jan 2015 20:18:22 +0300 Subject: rbd: fix error paths in rbd_dev_refresh() header_rwsem should be released on errors. Also remove useless rbd_dev->mapping.size != rbd_dev->header.image_size test. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ef070d7c5e3c..b85d52005a21 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3694,7 +3694,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) ret = rbd_dev_header_info(rbd_dev); if (ret) - return ret; + goto out; /* * If there is a parent, see if it has disappeared due to the @@ -3703,23 +3703,22 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) if (rbd_dev->parent) { ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) - return ret; + goto out; } if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { - if (rbd_dev->mapping.size != rbd_dev->header.image_size) - rbd_dev->mapping.size = rbd_dev->header.image_size; + rbd_dev->mapping.size = rbd_dev->header.image_size; } else { /* validate mapped snapshot's EXISTS flag */ rbd_exists_validate(rbd_dev); } +out: up_write(&rbd_dev->header_rwsem); - - if (mapping_size != rbd_dev->mapping.size) + if (!ret && mapping_size != rbd_dev->mapping.size) rbd_dev_update_size(rbd_dev); - return 0; + return ret; } static int rbd_init_disk(struct rbd_device *rbd_dev) -- cgit v1.2.3 From cf32bd9c86b6917d8446c00ea0081dde6e716a82 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 19 Jan 2015 22:57:39 +0300 Subject: rbd: do not treat standalone as flatten If the clone is resized down to 0, it becomes standalone. If such resize is carried over while an image is mapped we would detect this and call rbd_dev_parent_put() which means "let go of all parent state, including the spec(s) of parent images(s)". This leads to a mismatch between "rbd info" and sysfs parent fields, so a fix is in order. # rbd create --image-format 2 --size 1 foo # rbd snap create foo@snap # rbd snap protect foo@snap # rbd clone foo@snap bar # DEV=$(rbd map bar) # rbd resize --allow-shrink --size 0 bar # rbd resize --size 1 bar # rbd info bar | grep parent parent: rbd/foo@snap Before: # cat /sys/bus/rbd/devices/0/parent (no parent image) After: # cat /sys/bus/rbd/devices/0/parent pool_id 0 pool_name rbd image_id 10056b8b4567 image_name foo snap_id 2 snap_name snap overlap 0 Signed-off-by: Ilya Dryomov Reviewed-by: Josh Durgin Reviewed-by: Alex Elder --- drivers/block/rbd.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b85d52005a21..e818c2a6ffb1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4273,32 +4273,22 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } /* - * We always update the parent overlap. If it's zero we - * treat it specially. + * We always update the parent overlap. If it's zero we issue + * a warning, as we will proceed as if there was no parent. */ - rbd_dev->parent_overlap = overlap; if (!overlap) { - - /* A null parent_spec indicates it's the initial probe */ - if (parent_spec) { - /* - * The overlap has become zero, so the clone - * must have been resized down to 0 at some - * point. Treat this the same as a flatten. - */ - rbd_dev_parent_put(rbd_dev); - pr_info("%s: clone image now standalone\n", - rbd_dev->disk->disk_name); + /* refresh, careful to warn just once */ + if (rbd_dev->parent_overlap) + rbd_warn(rbd_dev, + "clone now standalone (overlap became 0)"); } else { - /* - * For the initial probe, if we find the - * overlap is zero we just pretend there was - * no parent image. - */ - rbd_warn(rbd_dev, "ignoring parent with overlap 0"); + /* initial probe */ + rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); } } + rbd_dev->parent_overlap = overlap; + out: ret = 0; out_err: -- cgit v1.2.3 From 7ad18afad02f9802f1eeade91cf880b97e7a9902 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Jan 2015 17:20:04 +0100 Subject: rbd: convert to blk-mq This converts the rbd driver to use the blk-mq infrastructure. Except for switching to a per-request work item this is almost mechanical. This was tested by Alexandre DERUMIER in November, and found to give him 120000 iops, although the only comparism available was an old 3.10 kernel which gave 80000iops. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder [idryomov@gmail.com: context, blk_mq_init_queue() EH] Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 122 +++++++++++++++++++++++++++++----------------------- 1 file changed, 68 insertions(+), 54 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e818c2a6ffb1..b40af3203089 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -340,9 +341,7 @@ struct rbd_device { char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ - struct list_head rq_queue; /* incoming rq queue */ spinlock_t lock; /* queue, flags, open_count */ - struct work_struct rq_work; struct rbd_image_header header; unsigned long flags; /* possibly lock protected */ @@ -360,6 +359,9 @@ struct rbd_device { atomic_t parent_ref; struct rbd_device *parent; + /* Block layer tags. */ + struct blk_mq_tag_set tag_set; + /* protects updating the header */ struct rw_semaphore header_rwsem; @@ -1817,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, /* * We support a 64-bit length, but ultimately it has to be - * passed to blk_end_request(), which takes an unsigned int. + * passed to the block layer, which just supports a 32-bit + * length field. */ obj_request->xferred = osd_req->r_reply_op_len[0]; rbd_assert(obj_request->xferred < (u64)UINT_MAX); @@ -2275,7 +2278,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) more = obj_request->which < img_request->obj_request_count - 1; } else { rbd_assert(img_request->rq != NULL); - more = blk_end_request(img_request->rq, result, xferred); + + more = blk_update_request(img_request->rq, result, xferred); + if (!more) + __blk_mq_end_request(img_request->rq, result); } return more; @@ -3304,8 +3310,10 @@ out: return ret; } -static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) +static void rbd_queue_workfn(struct work_struct *work) { + struct request *rq = blk_mq_rq_from_pdu(work); + struct rbd_device *rbd_dev = rq->q->queuedata; struct rbd_img_request *img_request; struct ceph_snap_context *snapc = NULL; u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; @@ -3314,6 +3322,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) u64 mapping_size; int result; + if (rq->cmd_type != REQ_TYPE_FS) { + dout("%s: non-fs request type %d\n", __func__, + (int) rq->cmd_type); + result = -EIO; + goto err; + } + if (rq->cmd_flags & REQ_DISCARD) op_type = OBJ_OP_DISCARD; else if (rq->cmd_flags & REQ_WRITE) @@ -3359,6 +3374,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; /* Shouldn't happen */ } + blk_mq_start_request(rq); + down_read(&rbd_dev->header_rwsem); mapping_size = rbd_dev->mapping.size; if (op_type != OBJ_OP_READ) { @@ -3404,53 +3421,18 @@ err_rq: rbd_warn(rbd_dev, "%s %llx at %llx result %d", obj_op_name(op_type), length, offset, result); ceph_put_snap_context(snapc); - blk_end_request_all(rq, result); +err: + blk_mq_end_request(rq, result); } -static void rbd_request_workfn(struct work_struct *work) +static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct rbd_device *rbd_dev = - container_of(work, struct rbd_device, rq_work); - struct request *rq, *next; - LIST_HEAD(requests); - - spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */ - list_splice_init(&rbd_dev->rq_queue, &requests); - spin_unlock_irq(&rbd_dev->lock); + struct request *rq = bd->rq; + struct work_struct *work = blk_mq_rq_to_pdu(rq); - list_for_each_entry_safe(rq, next, &requests, queuelist) { - list_del_init(&rq->queuelist); - rbd_handle_request(rbd_dev, rq); - } -} - -/* - * Called with q->queue_lock held and interrupts disabled, possibly on - * the way to schedule(). Do not sleep here! - */ -static void rbd_request_fn(struct request_queue *q) -{ - struct rbd_device *rbd_dev = q->queuedata; - struct request *rq; - int queued = 0; - - rbd_assert(rbd_dev); - - while ((rq = blk_fetch_request(q))) { - /* Ignore any non-FS requests that filter through. */ - if (rq->cmd_type != REQ_TYPE_FS) { - dout("%s: non-fs request type %d\n", __func__, - (int) rq->cmd_type); - __blk_end_request_all(rq, 0); - continue; - } - - list_add_tail(&rq->queuelist, &rbd_dev->rq_queue); - queued++; - } - - if (queued) - queue_work(rbd_wq, &rbd_dev->rq_work); + queue_work(rbd_wq, work); + return BLK_MQ_RQ_QUEUE_OK; } /* @@ -3511,6 +3493,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) del_gendisk(disk); if (disk->queue) blk_cleanup_queue(disk->queue); + blk_mq_free_tag_set(&rbd_dev->tag_set); } put_disk(disk); } @@ -3721,11 +3704,28 @@ out: return ret; } +static int rbd_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct work_struct *work = blk_mq_rq_to_pdu(rq); + + INIT_WORK(work, rbd_queue_workfn); + return 0; +} + +static struct blk_mq_ops rbd_mq_ops = { + .queue_rq = rbd_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = rbd_init_request, +}; + static int rbd_init_disk(struct rbd_device *rbd_dev) { struct gendisk *disk; struct request_queue *q; u64 segment_size; + int err; /* create gendisk info */ disk = alloc_disk(single_major ? @@ -3743,10 +3743,25 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); - if (!q) + memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); + rbd_dev->tag_set.ops = &rbd_mq_ops; + rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; + rbd_dev->tag_set.numa_node = NUMA_NO_NODE; + rbd_dev->tag_set.flags = + BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + rbd_dev->tag_set.nr_hw_queues = 1; + rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); + + err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); + if (err) goto out_disk; + q = blk_mq_init_queue(&rbd_dev->tag_set); + if (IS_ERR(q)) { + err = PTR_ERR(q); + goto out_tag_set; + } + /* We use the default size, but let's be explicit about it. */ blk_queue_physical_block_size(q, SECTOR_SIZE); @@ -3772,10 +3787,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->disk = disk; return 0; +out_tag_set: + blk_mq_free_tag_set(&rbd_dev->tag_set); out_disk: put_disk(disk); - - return -ENOMEM; + return err; } /* @@ -4032,8 +4048,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, return NULL; spin_lock_init(&rbd_dev->lock); - INIT_LIST_HEAD(&rbd_dev->rq_queue); - INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn); rbd_dev->flags = 0; atomic_set(&rbd_dev->parent_ref, 0); INIT_LIST_HEAD(&rbd_dev->node); -- cgit v1.2.3