summaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-18 11:05:25 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-18 11:05:25 -0700
commitd9b9c893048e9d308a833619f0866f1f52778cf5 (patch)
tree29090d6871a39fdf35b6e5b22fe49750e9cf7bb3 /drivers/block
parent0fe49f70a08d7d25acee3b066a88c654fea26121 (diff)
parentd31d07b97a5e76f41e00eb81dcca740e84aa7782 (diff)
Merge tag 'ceph-for-5.3-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "Lots of exciting things this time! - support for rbd object-map and fast-diff features (myself). This will speed up reads, discards and things like snap diffs on sparse images. - ceph.snap.btime vxattr to expose snapshot creation time (David Disseldorp). This will be used to integrate with "Restore Previous Versions" feature added in Windows 7 for folks who reexport ceph through SMB. - security xattrs for ceph (Zheng Yan). Only selinux is supported for now due to the limitations of ->dentry_init_security(). - support for MSG_ADDR2, FS_BTIME and FS_CHANGE_ATTR features (Jeff Layton). This is actually a single feature bit which was missing because of the filesystem pieces. With this in, the kernel client will finally be reported as "luminous" by "ceph features" -- it is still being reported as "jewel" even though all required Luminous features were implemented in 4.13. - stop NULL-terminating ceph vxattrs (Jeff Layton). The convention with xattrs is to not terminate and this was causing inconsistencies with ceph-fuse. - change filesystem time granularity from 1 us to 1 ns, again fixing an inconsistency with ceph-fuse (Luis Henriques). On top of this there are some additional dentry name handling and cap flushing fixes from Zheng. Finally, Jeff is formally taking over for Zheng as the filesystem maintainer" * tag 'ceph-for-5.3-rc1' of git://github.com/ceph/ceph-client: (71 commits) ceph: fix end offset in truncate_inode_pages_range call ceph: use generic_delete_inode() for ->drop_inode ceph: use ceph_evict_inode to cleanup inode's resource ceph: initialize superblock s_time_gran to 1 MAINTAINERS: take over for Zheng as CephFS kernel client maintainer rbd: setallochint only if object doesn't exist rbd: support for object-map and fast-diff rbd: call rbd_dev_mapping_set() from rbd_dev_image_probe() libceph: export osd_req_op_data() macro libceph: change ceph_osdc_call() to take page vector for response libceph: bump CEPH_MSG_MAX_DATA_LEN (again) rbd: new exclusive lock wait/wake code rbd: quiescing lock should wait for image requests rbd: lock should be quiesced on reacquire rbd: introduce copyup state machine rbd: rename rbd_obj_setup_*() to rbd_obj_init_*() rbd: move OSD request allocation into object request state machines rbd: factor out __rbd_osd_setup_discard_ops() rbd: factor out rbd_osd_setup_copyup() rbd: introduce obj_req->osd_reqs list ...
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/rbd.c2188
-rw-r--r--drivers/block/rbd_types.h10
2 files changed, 1610 insertions, 588 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e5009a34f9c2..3327192bb71f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -115,6 +115,8 @@ static int atomic_dec_return_safe(atomic_t *v)
#define RBD_FEATURE_LAYERING (1ULL<<0)
#define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP (1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF (1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5)
#define RBD_FEATURE_DATA_POOL (1ULL<<7)
#define RBD_FEATURE_OPERATIONS (1ULL<<8)
@@ -122,6 +124,8 @@ static int atomic_dec_return_safe(atomic_t *v)
#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
RBD_FEATURE_STRIPINGV2 | \
RBD_FEATURE_EXCLUSIVE_LOCK | \
+ RBD_FEATURE_OBJECT_MAP | \
+ RBD_FEATURE_FAST_DIFF | \
RBD_FEATURE_DEEP_FLATTEN | \
RBD_FEATURE_DATA_POOL | \
RBD_FEATURE_OPERATIONS)
@@ -203,6 +207,11 @@ struct rbd_client {
struct list_head node;
};
+struct pending_result {
+ int result; /* first nonzero result */
+ int num_pending;
+};
+
struct rbd_img_request;
enum obj_request_type {
@@ -219,6 +228,18 @@ enum obj_operation_type {
OBJ_OP_ZEROOUT,
};
+#define RBD_OBJ_FLAG_DELETION (1U << 0)
+#define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1)
+#define RBD_OBJ_FLAG_COPYUP_ZEROS (1U << 2)
+#define RBD_OBJ_FLAG_MAY_EXIST (1U << 3)
+#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT (1U << 4)
+
+enum rbd_obj_read_state {
+ RBD_OBJ_READ_START = 1,
+ RBD_OBJ_READ_OBJECT,
+ RBD_OBJ_READ_PARENT,
+};
+
/*
* Writes go through the following state machine to deal with
* layering:
@@ -245,17 +266,28 @@ enum obj_operation_type {
* even if there is a parent).
*/
enum rbd_obj_write_state {
- RBD_OBJ_WRITE_FLAT = 1,
- RBD_OBJ_WRITE_GUARD,
- RBD_OBJ_WRITE_READ_FROM_PARENT,
- RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
- RBD_OBJ_WRITE_COPYUP_OPS,
+ RBD_OBJ_WRITE_START = 1,
+ RBD_OBJ_WRITE_PRE_OBJECT_MAP,
+ RBD_OBJ_WRITE_OBJECT,
+ __RBD_OBJ_WRITE_COPYUP,
+ RBD_OBJ_WRITE_COPYUP,
+ RBD_OBJ_WRITE_POST_OBJECT_MAP,
+};
+
+enum rbd_obj_copyup_state {
+ RBD_OBJ_COPYUP_START = 1,
+ RBD_OBJ_COPYUP_READ_PARENT,
+ __RBD_OBJ_COPYUP_OBJECT_MAPS,
+ RBD_OBJ_COPYUP_OBJECT_MAPS,
+ __RBD_OBJ_COPYUP_WRITE_OBJECT,
+ RBD_OBJ_COPYUP_WRITE_OBJECT,
};
struct rbd_obj_request {
struct ceph_object_extent ex;
+ unsigned int flags; /* RBD_OBJ_FLAG_* */
union {
- bool tried_parent; /* for reads */
+ enum rbd_obj_read_state read_state; /* for reads */
enum rbd_obj_write_state write_state; /* for writes */
};
@@ -271,14 +303,15 @@ struct rbd_obj_request {
u32 bvec_idx;
};
};
+
+ enum rbd_obj_copyup_state copyup_state;
struct bio_vec *copyup_bvecs;
u32 copyup_bvec_count;
- struct ceph_osd_request *osd_req;
-
- u64 xferred; /* bytes transferred */
- int result;
+ struct list_head osd_reqs; /* w/ r_private_item */
+ struct mutex state_mutex;
+ struct pending_result pending;
struct kref kref;
};
@@ -287,11 +320,19 @@ enum img_req_flags {
IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
};
+enum rbd_img_state {
+ RBD_IMG_START = 1,
+ RBD_IMG_EXCLUSIVE_LOCK,
+ __RBD_IMG_OBJECT_REQUESTS,
+ RBD_IMG_OBJECT_REQUESTS,
+};
+
struct rbd_img_request {
struct rbd_device *rbd_dev;
enum obj_operation_type op_type;
enum obj_request_type data_type;
unsigned long flags;
+ enum rbd_img_state state;
union {
u64 snap_id; /* for reads */
struct ceph_snap_context *snapc; /* for writes */
@@ -300,13 +341,14 @@ struct rbd_img_request {
struct request *rq; /* block request */
struct rbd_obj_request *obj_request; /* obj req initiator */
};
- spinlock_t completion_lock;
- u64 xferred;/* aggregate bytes transferred */
- int result; /* first nonzero obj_request result */
+ struct list_head lock_item;
struct list_head object_extents; /* obj_req.ex structs */
- u32 pending_count;
+ struct mutex state_mutex;
+ struct pending_result pending;
+ struct work_struct work;
+ int work_result;
struct kref kref;
};
@@ -380,7 +422,17 @@ struct rbd_device {
struct work_struct released_lock_work;
struct delayed_work lock_dwork;
struct work_struct unlock_work;
- wait_queue_head_t lock_waitq;
+ spinlock_t lock_lists_lock;
+ struct list_head acquiring_list;
+ struct list_head running_list;
+ struct completion acquire_wait;
+ int acquire_err;
+ struct completion releasing_wait;
+
+ spinlock_t object_map_lock;
+ u8 *object_map;
+ u64 object_map_size; /* in objects */
+ u64 object_map_flags;
struct workqueue_struct *task_wq;
@@ -408,12 +460,10 @@ struct rbd_device {
* Flag bits for rbd_dev->flags:
* - REMOVING (which is coupled with rbd_dev->open_count) is protected
* by rbd_dev->lock
- * - BLACKLISTED is protected by rbd_dev->lock_rwsem
*/
enum rbd_dev_flags {
RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
- RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};
static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
@@ -466,6 +516,8 @@ static int minor_to_rbd_dev_id(int minor)
static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
+ lockdep_assert_held(&rbd_dev->lock_rwsem);
+
return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}
@@ -583,6 +635,26 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
u64 *snap_features);
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
+
+static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
+static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
+
+/*
+ * Return true if nothing else is pending.
+ */
+static bool pending_result_dec(struct pending_result *pending, int *result)
+{
+ rbd_assert(pending->num_pending > 0);
+
+ if (*result && !pending->result)
+ pending->result = *result;
+ if (--pending->num_pending)
+ return false;
+
+ *result = pending->result;
+ return true;
+}
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
@@ -1317,6 +1389,8 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
u32 bytes)
{
+ dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
+
switch (obj_req->img_request->data_type) {
case OBJ_REQUEST_BIO:
zero_bios(&obj_req->bio_pos, off, bytes);
@@ -1339,13 +1413,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
-static void rbd_img_request_get(struct rbd_img_request *img_request)
-{
- dout("%s: img %p (was %d)\n", __func__, img_request,
- kref_read(&img_request->kref));
- kref_get(&img_request->kref);
-}
-
static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
@@ -1362,7 +1429,6 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
/* Image request now owns object's original reference */
obj_request->img_request = img_request;
- img_request->pending_count++;
dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}
@@ -1375,13 +1441,13 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
rbd_obj_request_put(obj_request);
}
-static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
+static void rbd_osd_submit(struct ceph_osd_request *osd_req)
{
- struct ceph_osd_request *osd_req = obj_request->osd_req;
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
- dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
- obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
- obj_request->ex.oe_len, osd_req);
+ dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
+ __func__, osd_req, obj_req, obj_req->ex.oe_objno,
+ obj_req->ex.oe_off, obj_req->ex.oe_len);
ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}
@@ -1457,41 +1523,38 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req)
}
}
-static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
-
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
struct rbd_obj_request *obj_req = osd_req->r_priv;
+ int result;
dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
osd_req->r_result, obj_req);
- rbd_assert(osd_req == obj_req->osd_req);
- obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
- if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
- obj_req->xferred = osd_req->r_result;
+ /*
+ * Writes aren't allowed to return a data payload. In some
+ * guarded write cases (e.g. stat + zero on an empty object)
+ * a stat response makes it through, but we don't care.
+ */
+ if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
+ result = 0;
else
- /*
- * Writes aren't allowed to return a data payload. In some
- * guarded write cases (e.g. stat + zero on an empty object)
- * a stat response makes it through, but we don't care.
- */
- obj_req->xferred = 0;
+ result = osd_req->r_result;
- rbd_obj_handle_request(obj_req);
+ rbd_obj_handle_request(obj_req, result);
}
-static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
+static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
- struct ceph_osd_request *osd_req = obj_request->osd_req;
+ struct rbd_obj_request *obj_request = osd_req->r_priv;
osd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req->r_snapid = obj_request->img_request->snap_id;
}
-static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
+static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
- struct ceph_osd_request *osd_req = obj_request->osd_req;
+ struct rbd_obj_request *obj_request = osd_req->r_priv;
osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
ktime_get_real_ts64(&osd_req->r_mtime);
@@ -1499,19 +1562,21 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
}
static struct ceph_osd_request *
-__rbd_osd_req_create(struct rbd_obj_request *obj_req,
- struct ceph_snap_context *snapc, unsigned int num_ops)
+__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
+ struct ceph_snap_context *snapc, int num_ops)
{
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct ceph_osd_request *req;
const char *name_format = rbd_dev->image_format == 1 ?
RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
+ int ret;
req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
if (!req)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+ list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
req->r_callback = rbd_osd_req_callback;
req->r_priv = obj_req;
@@ -1522,27 +1587,20 @@ __rbd_osd_req_create(struct rbd_obj_request *obj_req,
ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
req->r_base_oloc.pool = rbd_dev->layout.pool_id;
- if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
- rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
- goto err_req;
+ ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+ rbd_dev->header.object_prefix,
+ obj_req->ex.oe_objno);
+ if (ret)
+ return ERR_PTR(ret);
return req;
-
-err_req:
- ceph_osdc_put_request(req);
- return NULL;
}
static struct ceph_osd_request *
-rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
+rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
- return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc,
- num_ops);
-}
-
-static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
-{
- ceph_osdc_put_request(osd_req);
+ return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
+ num_ops);
}
static struct rbd_obj_request *rbd_obj_request_create(void)
@@ -1554,6 +1612,8 @@ static struct rbd_obj_request *rbd_obj_request_create(void)
return NULL;
ceph_object_extent_init(&obj_request->ex);
+ INIT_LIST_HEAD(&obj_request->osd_reqs);
+ mutex_init(&obj_request->state_mutex);
kref_init(&obj_request->kref);
dout("%s %p\n", __func__, obj_request);
@@ -1563,14 +1623,19 @@ static struct rbd_obj_request *rbd_obj_request_create(void)
static void rbd_obj_request_destroy(struct kref *kref)
{
struct rbd_obj_request *obj_request;
+ struct ceph_osd_request *osd_req;
u32 i;
obj_request = container_of(kref, struct rbd_obj_request, kref);
dout("%s: obj %p\n", __func__, obj_request);
- if (obj_request->osd_req)
- rbd_osd_req_destroy(obj_request->osd_req);
+ while (!list_empty(&obj_request->osd_reqs)) {
+ osd_req = list_first_entry(&obj_request->osd_reqs,
+ struct ceph_osd_request, r_private_item);
+ list_del_init(&osd_req->r_private_item);
+ ceph_osdc_put_request(osd_req);
+ }
switch (obj_request->img_request->data_type) {
case OBJ_REQUEST_NODATA:
@@ -1684,8 +1749,9 @@ static struct rbd_img_request *rbd_img_request_create(
if (rbd_dev_parent_get(rbd_dev))
img_request_layered_set(img_request);
- spin_lock_init(&img_request->completion_lock);
+ INIT_LIST_HEAD(&img_request->lock_item);
INIT_LIST_HEAD(&img_request->object_extents);
+ mutex_init(&img_request->state_mutex);
kref_init(&img_request->kref);
dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
@@ -1703,6 +1769,7 @@ static void rbd_img_request_destroy(struct kref *kref)
dout("%s: img %p\n", __func__, img_request);
+ WARN_ON(!list_empty(&img_request->lock_item));
for_each_obj_request_safe(img_request, obj_request, next_obj_request)
rbd_img_obj_request_del(img_request, obj_request);
@@ -1717,6 +1784,466 @@ static void rbd_img_request_destroy(struct kref *kref)
kmem_cache_free(rbd_img_request_cache, img_request);
}
+#define BITS_PER_OBJ 2
+#define OBJS_PER_BYTE (BITS_PER_BYTE / BITS_PER_OBJ)
+#define OBJ_MASK ((1 << BITS_PER_OBJ) - 1)
+
+static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
+ u64 *index, u8 *shift)
+{
+ u32 off;
+
+ rbd_assert(objno < rbd_dev->object_map_size);
+ *index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
+ *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
+}
+
+static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+ u64 index;
+ u8 shift;
+
+ lockdep_assert_held(&rbd_dev->object_map_lock);
+ __rbd_object_map_index(rbd_dev, objno, &index, &shift);
+ return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
+}
+
+static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
+{
+ u64 index;
+ u8 shift;
+ u8 *p;
+
+ lockdep_assert_held(&rbd_dev->object_map_lock);
+ rbd_assert(!(val & ~OBJ_MASK));
+
+ __rbd_object_map_index(rbd_dev, objno, &index, &shift);
+ p = &rbd_dev->object_map[index];
+ *p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
+}
+
+static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+ u8 state;
+
+ spin_lock(&rbd_dev->object_map_lock);
+ state = __rbd_object_map_get(rbd_dev, objno);
+ spin_unlock(&rbd_dev->object_map_lock);
+ return state;
+}
+
+static bool use_object_map(struct rbd_device *rbd_dev)
+{
+ return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
+ !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
+}
+
+static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
+{
+ u8 state;
+
+ /* fall back to default logic if object map is disabled or invalid */
+ if (!use_object_map(rbd_dev))
+ return true;
+
+ state = rbd_object_map_get(rbd_dev, objno);
+ return state != OBJECT_NONEXISTENT;
+}
+
+static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
+ struct ceph_object_id *oid)
+{
+ if (snap_id == CEPH_NOSNAP)
+ ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
+ rbd_dev->spec->image_id);
+ else
+ ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
+ rbd_dev->spec->image_id, snap_id);
+}
+
+static int rbd_object_map_lock(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ CEPH_DEFINE_OID_ONSTACK(oid);
+ u8 lock_type;
+ char *lock_tag;
+ struct ceph_locker *lockers;
+ u32 num_lockers;
+ bool broke_lock = false;
+ int ret;
+
+ rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+again:
+ ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+ CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
+ if (ret != -EBUSY || broke_lock) {
+ if (ret == -EEXIST)
+ ret = 0; /* already locked by myself */
+ if (ret)
+ rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
+ return ret;
+ }
+
+ ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
+ RBD_LOCK_NAME, &lock_type, &lock_tag,
+ &lockers, &num_lockers);
+ if (ret) {
+ if (ret == -ENOENT)
+ goto again;
+
+ rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
+ return ret;
+ }
+
+ kfree(lock_tag);
+ if (num_lockers == 0)
+ goto again;
+
+ rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
+ ENTITY_NAME(lockers[0].id.name));
+
+ ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
+ RBD_LOCK_NAME, lockers[0].id.cookie,
+ &lockers[0].id.name);
+ ceph_free_lockers(lockers, num_lockers);
+ if (ret) {
+ if (ret == -ENOENT)
+ goto again;
+
+ rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
+ return ret;
+ }
+
+ broke_lock = true;
+ goto again;
+}
+
+static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ CEPH_DEFINE_OID_ONSTACK(oid);
+ int ret;
+
+ rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+ ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+ "");
+ if (ret && ret != -ENOENT)
+ rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
+}
+
+static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
+{
+ u8 struct_v;
+ u32 struct_len;
+ u32 header_len;
+ void *header_end;
+ int ret;
+
+ ceph_decode_32_safe(p, end, header_len, e_inval);
+ header_end = *p + header_len;
+
+ ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
+ &struct_len);
+ if (ret)
+ return ret;
+
+ ceph_decode_64_safe(p, end, *object_map_size, e_inval);
+
+ *p = header_end;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int __rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ CEPH_DEFINE_OID_ONSTACK(oid);
+ struct page **pages;
+ void *p, *end;
+ size_t reply_len;
+ u64 num_objects;
+ u64 object_map_bytes;
+ u64 object_map_size;
+ int num_pages;
+ int ret;
+
+ rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
+
+ num_objects = ceph_get_num_objects(&rbd_dev->layout,
+ rbd_dev->mapping.size);
+ object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
+ BITS_PER_BYTE);
+ num_pages = calc_pages_for(0, object_map_bytes) + 1;
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ reply_len = num_pages * PAGE_SIZE;
+ rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
+ ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
+ "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
+ NULL, 0, pages, &reply_len);
+ if (ret)
+ goto out;
+
+ p = page_address(pages[0]);
+ end = p + min(reply_len, (size_t)PAGE_SIZE);
+ ret = decode_object_map_header(&p, end, &object_map_size);
+ if (ret)
+ goto out;
+
+ if (object_map_size != num_objects) {
+ rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
+ object_map_size, num_objects);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (offset_in_page(p) + object_map_bytes > reply_len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
+ if (!rbd_dev->object_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rbd_dev->object_map_size = object_map_size;
+ ceph_copy_from_page_vector(pages, rbd_dev->object_map,
+ offset_in_page(p), object_map_bytes);
+
+out:
+ ceph_release_page_vector(pages, num_pages);
+ return ret;
+}
+
+static void rbd_object_map_free(struct rbd_device *rbd_dev)
+{
+ kvfree(rbd_dev->object_map);
+ rbd_dev->object_map = NULL;
+ rbd_dev->object_map_size = 0;
+}
+
+static int rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ ret = __rbd_object_map_load(rbd_dev);
+ if (ret)
+ return ret;
+
+ ret = rbd_dev_v2_get_flags(rbd_dev);
+ if (ret) {
+ rbd_object_map_free(rbd_dev);
+ return ret;
+ }
+
+ if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
+ rbd_warn(rbd_dev, "object map is invalid");
+
+ return 0;
+}
+
+static int rbd_object_map_open(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ ret = rbd_object_map_lock(rbd_dev);
+ if (ret)
+ return ret;
+
+ ret = rbd_object_map_load(rbd_dev);
+ if (ret) {
+ rbd_object_map_unlock(rbd_dev);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void rbd_object_map_close(struct rbd_device *rbd_dev)
+{
+ rbd_object_map_free(rbd_dev);
+ rbd_object_map_unlock(rbd_dev);
+}
+
+/*
+ * This function needs snap_id (or more precisely just something to
+ * distinguish between HEAD and snapshot object maps), new_state and
+ * current_state that were passed to rbd_object_map_update().
+ *
+ * To avoid allocating and stashing a context we piggyback on the OSD
+ * request. A HEAD update has two ops (assert_locked). For new_state
+ * and current_state we decode our own object_map_update op, encoded in
+ * rbd_cls_object_map_update().
+ */
+static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
+ struct ceph_osd_request *osd_req)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ struct ceph_osd_data *osd_data;
+ u64 objno;
+ u8 state, new_state, current_state;
+ bool has_current_state;
+ void *p;
+
+ if (osd_req->r_result)
+ return osd_req->r_result;
+
+ /*
+ * Nothing to do for a snapshot object map.
+ */
+ if (osd_req->r_num_ops == 1)
+ return 0;
+
+ /*
+ * Update in-memory HEAD object map.
+ */
+ rbd_assert(osd_req->r_num_ops == 2);
+ osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
+ rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
+
+ p = page_address(osd_data->pages[0]);
+ objno = ceph_decode_64(&p);
+ rbd_assert(objno == obj_req->ex.oe_objno);
+ rbd_assert(ceph_decode_64(&p) == objno + 1);
+ new_state = ceph_decode_8(&p);
+ has_current_state = ceph_decode_8(&p);
+ if (has_current_state)
+ current_state = ceph_decode_8(&p);
+
+ spin_lock(&rbd_dev->object_map_lock);
+ state = __rbd_object_map_get(rbd_dev, objno);
+ if (!has_current_state || current_state == state ||
+ (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
+ __rbd_object_map_set(rbd_dev, objno, new_state);
+ spin_unlock(&rbd_dev->object_map_lock);
+
+ return 0;
+}
+
+static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
+{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
+ int result;
+
+ dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
+ osd_req->r_result, obj_req);
+
+ result = rbd_object_map_update_finish(obj_req, osd_req);
+ rbd_obj_handle_request(obj_req, result);
+}
+
+static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
+{
+ u8 state = rbd_object_map_get(rbd_dev, objno);
+
+ if (state == new_state ||
+ (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
+ (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
+ return false;
+
+ return true;
+}
+
+static int rbd_cls_object_map_update(struct ceph_osd_request *req,
+ int which, u64 objno, u8 new_state,
+ const u8 *current_state)
+{
+ struct page **pages;
+ void *p, *start;
+ int ret;
+
+ ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
+ if (ret)
+ return ret;
+
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ p = start = page_address(pages[0]);
+ ceph_encode_64(&p, objno);
+ ceph_encode_64(&p, objno + 1);
+ ceph_encode_8(&p, new_state);
+ if (current_state) {
+ ceph_encode_8(&p, 1);
+ ceph_encode_8(&p, *current_state);
+ } else {
+ ceph_encode_8(&p, 0);
+ }
+
+ osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
+ false, true);
+ return 0;
+}
+
+/*
+ * Return:
+ * 0 - object map update sent
+ * 1 - object map update isn't needed
+ * <0 - error
+ */
+static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
+ u8 new_state, const u8 *current_state)
+{
+ struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct ceph_osd_request *req;
+ int num_ops = 1;
+ int which = 0;
+ int ret;
+
+ if (snap_id == CEPH_NOSNAP) {
+ if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
+ return 1;
+
+ num_ops++; /* assert_locked */
+ }
+
+ req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
+ if (!req)
+ return -ENOMEM;
+
+ list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
+ req->r_callback = rbd_object_map_callback;
+ req->r_priv = obj_req;
+
+ rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
+ ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
+ req->r_flags = CEPH_OSD_FLAG_WRITE;
+ ktime_get_real_ts64(&req->r_mtime);
+
+ if (snap_id == CEPH_NOSNAP) {
+ /*
+ * Protect against possible race conditions during lock
+ * ownership transitions.
+ */
+ ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
+ CEPH_CLS_LOCK_EXCLUSIVE, "", "");
+ if (ret)
+ return ret;
+ }
+
+ ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
+ new_state, current_state);
+ if (ret)
+ return ret;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ return ret;
+
+ ceph_osdc_start_request(osdc, req, false);
+ return 0;
+}
+
static void prune_extents(struct ceph_file_extent *img_extents,
u32 *num_img_extents, u64 overlap)
{
@@ -1764,11 +2291,13 @@ static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
return 0;
}
-static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
+static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
+
switch (obj_req->img_request->data_type) {
case OBJ_REQUEST_BIO:
- osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
+ osd_req_op_extent_osd_data_bio(osd_req, which,
&obj_req->bio_pos,
obj_req->ex.oe_len);
break;
@@ -1777,7 +2306,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
rbd_assert(obj_req->bvec_pos.iter.bi_size ==
obj_req->ex.oe_len);
rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
- osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
+ osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
&obj_req->bvec_pos);
break;
default:
@@ -1785,22 +2314,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
}
}
-static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
-{
- obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1);
- if (!obj_req->osd_req)
- return -ENOMEM;
-
- osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
- obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
- rbd_osd_req_setup_data(obj_req, 0);
-
- rbd_osd_req_format_read(obj_req);
- return 0;
-}
-
-static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
- unsigned int which)
+static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
{
struct page **pages;
@@ -1816,45 +2330,60 @@ static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
if (IS_ERR(pages))
return PTR_ERR(pages);
- osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
- osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
+ osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
+ osd_req_op_raw_data_in_pages(osd_req, which, pages,
8 + sizeof(struct ceph_timespec),
0, false, true);
return 0;
}
-static int count_write_ops(struct rbd_obj_request *obj_req)
+static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
+ u32 bytes)
+{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
+ int ret;
+
+ ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
+ if (ret)
+ return ret;
+
+ osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
+ obj_req->copyup_bvec_count, bytes);
+ return 0;
+}
+
+static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
{
- return 2; /* setallochint + write/writefull */
+ obj_req->read_state = RBD_OBJ_READ_START;
+ return 0;
}
-static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
- unsigned int which)
+static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
+ int which)
{
+ struct rbd_obj_request *obj_req = osd_req->r_priv;
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
u16 opcode;
- osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
- rbd_dev->layout.object_size,
- rbd_dev->layout.object_size);
+ if (!use_object_map(rbd_dev) ||
+ !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
+ osd_req_op_alloc_hint_init(osd_req, which++,
+ rbd_dev->layout.object_size,
+ rbd_dev->layout.object_size);
+ }
if (rbd_obj_is_entire(obj_req))
opcode = CEPH_OSD_OP_WRITEFULL;
else
opcode = CEPH_OSD_OP_WRITE;
- osd_req_op_extent_init(obj_req->osd_req, which, opcode,
+ osd_req_op_extent_init(osd_req, which, opcode,
obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
- rbd_osd_req_setup_data(obj_req, which++);
-
- rbd_assert(which == obj_req->osd_req->r_num_ops);
- rbd_osd_req_format_write(obj_req);
+ rbd_osd_setup_data(osd_req, which);
}
-static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
+static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
{
- unsigned int num_osd_ops, which = 0;
- bool need_guard;
int ret;
/* reverse map the entire object onto the parent */
@@ -1862,24 +2391,10 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
if (ret)
return ret;
- need_guard = rbd_obj_copyup_enabled(obj_req);
- num_osd_ops = need_guard + count_write_ops(obj_req);
-
- obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
- if (!obj_req->osd_req)
- return -ENOMEM;
-
- if (need_guard) {
- ret = __rbd_obj_setup_stat(obj_req, which++);
- if (ret)
- return ret;
+ if (rbd_obj_copyup_enabled(obj_req))
+ obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
- obj_req