summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-04 21:12:47 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-04 21:12:47 -0800
commitac322de6bf5416cb145b58599297b8be73cd86ac (patch)
tree1a1be9f8b9241159fb4cde14a548eba9a4155b28 /drivers/md
parentccf21b69a83afaee4d5499e0d03eacf23946e08c (diff)
parent339421def582abb14c2217aa8c8f28bb2e299174 (diff)
Merge tag 'md/4.4' of git://neil.brown.name/md
Pull md updates from Neil Brown: "Two major components to this update. 1) The clustered-raid1 support from SUSE is nearly complete. There are a few outstanding issues being worked on. Maybe half a dozen patches will bring this to a usable state. 2) The first stage of journalled-raid5 support from Facebook makes an appearance. With a journal device configured (typically NVRAM or SSD), the "RAID5 write hole" should be closed - a crash during degraded operations cannot result in data corruption. The next stage will be to use the journal as a write-behind cache so that latency can be reduced and in some cases throughput increased by performing more full-stripe writes. * tag 'md/4.4' of git://neil.brown.name/md: (66 commits) MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW MD: set journal disk ->raid_disk MD: kick out journal disk if it's not fresh raid5-cache: start raid5 readonly if journal is missing MD: add new bit to indicate raid array with journal raid5-cache: IO error handling raid5: journal disk can't be removed raid5-cache: add trim support for log MD: fix info output for journal disk raid5-cache: use bio chaining raid5-cache: small log->seq cleanup raid5-cache: new helper: r5_reserve_log_entry raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta raid5-cache: take rdev->data_offset into account early on raid5-cache: refactor bio allocation raid5-cache: clean up r5l_get_meta raid5-cache: simplify state machine when caches flushes are not needed raid5-cache: factor out a helper to run all stripes for an I/O unit raid5-cache: rename flushed_ios to finished_ios raid5-cache: free I/O units earlier ...
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/bitmap.c14
-rw-r--r--drivers/md/bitmap.h4
-rw-r--r--drivers/md/md-cluster.c228
-rw-r--r--drivers/md/md-cluster.h12
-rw-r--r--drivers/md/md.c495
-rw-r--r--drivers/md/md.h17
-rw-r--r--drivers/md/raid1.c41
-rw-r--r--drivers/md/raid1.h7
-rw-r--r--drivers/md/raid10.c2
-rw-r--r--drivers/md/raid5-cache.c1191
-rw-r--r--drivers/md/raid5.c189
-rw-r--r--drivers/md/raid5.h24
13 files changed, 1917 insertions, 309 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 462f443a4f85..f34979cd141a 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o
+raid456-y += raid5.o raid5-cache.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 48b5890c28e3..4f22e919787a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -613,12 +613,10 @@ re_read:
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
- /* XXX: This is a hack to ensure that we don't use clustering
- * in case:
- * - dm-raid is in use and
- * - the nodes written in bitmap_sb is erroneous.
+ /* Setup nodes/clustername only if bitmap version is
+ * cluster-compatible
*/
- if (!bitmap->mddev->sync_super) {
+ if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes);
strlcpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64);
@@ -628,7 +626,7 @@ re_read:
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
reason = "bad magic";
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
- le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
+ le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
reason = "unrecognized superblock version";
else if (chunksize < 512)
reason = "bitmap chunksize too small";
@@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
}
EXPORT_SYMBOL(bitmap_close_sync);
-void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
{
sector_t s = 0;
sector_t blocks;
@@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->last_end_sync = jiffies;
return;
}
- if (time_before(jiffies, (bitmap->last_end_sync
+ if (!force && time_before(jiffies, (bitmap->last_end_sync
+ bitmap->mddev->bitmap_info.daemon_sleep)))
return;
wait_event(bitmap->mddev->recovery_wait,
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index f1f4dd01090d..7d5c3a610ca5 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -9,8 +9,10 @@
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
+ * Version 5 is currently set only for clustered devices
*/
#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
/*
@@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void bitmap_close_sync(struct bitmap *bitmap);
-void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
+void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct mddev *mddev);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 11e3bc9d2a4b..d6a1126d85ce 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -28,6 +28,7 @@ struct dlm_lock_resource {
struct completion completion; /* completion for synchronized locking */
void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
struct mddev *mddev; /* pointing back to mddev. */
+ int mode;
};
struct suspend_info {
@@ -53,8 +54,8 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
- struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
+ struct dlm_lock_resource *resync_lockres;
struct list_head suspend_list;
spinlock_t suspend_lock;
struct md_thread *recovery_thread;
@@ -79,20 +80,20 @@ enum msg_type {
};
struct cluster_msg {
- int type;
- int slot;
+ __le32 type;
+ __le32 slot;
/* TODO: Unionize this for smaller footprint */
- sector_t low;
- sector_t high;
+ __le64 low;
+ __le64 high;
char uuid[16];
- int raid_slot;
+ __le32 raid_slot;
};
static void sync_ast(void *arg)
{
struct dlm_lock_resource *res;
- res = (struct dlm_lock_resource *) arg;
+ res = arg;
complete(&res->completion);
}
@@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
if (ret)
return ret;
wait_for_completion(&res->completion);
+ if (res->lksb.sb_status == 0)
+ res->mode = mode;
return res->lksb.sb_status;
}
@@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
+ res->mode = DLM_LOCK_IV;
namelen = strlen(name);
res->name = kzalloc(namelen + 1, GFP_KERNEL);
if (!res->name) {
@@ -191,8 +195,8 @@ retry:
kfree(res);
}
-static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
- sector_t lo, sector_t hi)
+static void add_resync_info(struct dlm_lock_resource *lockres,
+ sector_t lo, sector_t hi)
{
struct resync_info *ri;
@@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc
dlm_lock_sync(lockres, DLM_LOCK_CR);
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
hi = le64_to_cpu(ri.hi);
- if (ri.hi > 0) {
+ if (hi > 0) {
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
if (!s)
goto out;
@@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = {
*/
static void ack_bast(void *arg, int mode)
{
- struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
+ struct dlm_lock_resource *res = arg;
struct md_cluster_info *cinfo = res->mddev->cluster_info;
if (mode == DLM_LOCK_EX)
@@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
if (slot == s->slot) {
- pr_info("%s:%d Deleting suspend_info: %d\n",
- __func__, __LINE__, slot);
list_del(&s->list);
kfree(s);
break;
}
}
-static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+static void remove_suspend_info(struct mddev *mddev, int slot)
{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
spin_lock_irq(&cinfo->suspend_lock);
__remove_suspend_info(cinfo, slot);
spin_unlock_irq(&cinfo->suspend_lock);
+ mddev->pers->quiesce(mddev, 2);
}
-static void process_suspend_info(struct md_cluster_info *cinfo,
+static void process_suspend_info(struct mddev *mddev,
int slot, sector_t lo, sector_t hi)
{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
struct suspend_info *s;
if (!hi) {
- remove_suspend_info(cinfo, slot);
+ remove_suspend_info(mddev, slot);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
return;
}
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
@@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
s->slot = slot;
s->lo = lo;
s->hi = hi;
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
spin_lock_irq(&cinfo->suspend_lock);
/* Remove existing entry (if exists) before adding */
__remove_suspend_info(cinfo, slot);
list_add(&s->list, &cinfo->suspend_list);
spin_unlock_irq(&cinfo->suspend_lock);
+ mddev->pers->quiesce(mddev, 2);
}
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
@@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
- snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
+ snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
@@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
-
- md_reload_sb(mddev);
+ md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
}
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
- struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+ struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
+ le32_to_cpu(msg->raid_slot));
if (rdev)
md_kick_rdev_from_array(rdev);
else
- pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
+ pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
+ __func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
- struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+ struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
+ le32_to_cpu(msg->raid_slot));
if (rdev && test_bit(Faulty, &rdev->flags))
clear_bit(Faulty, &rdev->flags);
else
- pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
+ pr_warn("%s: %d Could not find disk(%d) which is faulty",
+ __func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
- switch (msg->type) {
+ if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
+ "node %d received it's own msg\n", le32_to_cpu(msg->slot)))
+ return;
+ switch (le32_to_cpu(msg->type)) {
case METADATA_UPDATED:
- pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
- __func__, __LINE__, msg->slot);
process_metadata_update(mddev, msg);
break;
case RESYNCING:
- pr_info("%s: %d Received message: RESYNCING from %d\n",
- __func__, __LINE__, msg->slot);
- process_suspend_info(mddev->cluster_info, msg->slot,
- msg->low, msg->high);
+ process_suspend_info(mddev, le32_to_cpu(msg->slot),
+ le64_to_cpu(msg->low),
+ le64_to_cpu(msg->high));
break;
case NEWDISK:
- pr_info("%s: %d Received message: NEWDISK from %d\n",
- __func__, __LINE__, msg->slot);
process_add_new_disk(mddev, msg);
break;
case REMOVE:
- pr_info("%s: %d Received REMOVE from %d\n",
- __func__, __LINE__, msg->slot);
process_remove_disk(mddev, msg);
break;
case RE_ADD:
- pr_info("%s: %d Received RE_ADD from %d\n",
- __func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
case BITMAP_NEEDS_SYNC:
- pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
- __func__, __LINE__, msg->slot);
- __recover_slot(mddev, msg->slot);
+ __recover_slot(mddev, le32_to_cpu(msg->slot));
break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
@@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread)
/* lock_comm()
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
+ * If called again, and the TOKEN lock is alread in EX mode
+ * return success. However, care must be taken that unlock_comm()
+ * is called only once.
*/
static int lock_comm(struct md_cluster_info *cinfo)
{
int error;
+ if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+ return 0;
+
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
@@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo)
static void unlock_comm(struct md_cluster_info *cinfo)
{
+ WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
dlm_unlock_sync(cinfo->token_lockres);
}
@@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes)
init_completion(&cinfo->completion);
set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
- mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;
memset(str, 0, 64);
@@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}
+ cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
+ if (!cinfo->resync_lockres)
+ goto err;
+
ret = gather_all_resync_info(mddev, nodes);
if (ret)
goto err;
@@ -763,6 +778,7 @@ err:
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
+ lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
@@ -771,12 +787,32 @@ err:
return ret;
}
+static void resync_bitmap(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg = {0};
+ int err;
+
+ cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
+ err = sendmsg(cinfo, &cmsg);
+ if (err)
+ pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
+ __func__, __LINE__, err);
+}
+
static int leave(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
if (!cinfo)
return 0;
+
+ /* BITMAP_NEEDS_SYNC message should be sent when node
+ * is leaving the cluster with dirty bitmap, also we
+ * can only deliver it when dlm connection is available */
+ if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
+ resync_bitmap(mddev);
+
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev)
return cinfo->slot_number - 1;
}
-static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
-{
- struct md_cluster_info *cinfo = mddev->cluster_info;
-
- add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
- /* Re-acquire the lock to refresh LVB */
- dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
-}
-
static int metadata_update_start(struct mddev *mddev)
{
return lock_comm(mddev->cluster_info);
@@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
- int ret;
+ struct md_rdev *rdev;
+ int ret = 0;
+ int raid_slot = -1;
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(METADATA_UPDATED);
- ret = __sendmsg(cinfo, &cmsg);
+ /* Pick up a good active device number to send.
+ */
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
+ raid_slot = rdev->desc_nr;
+ break;
+ }
+ if (raid_slot >= 0) {
+ cmsg.raid_slot = cpu_to_le32(raid_slot);
+ ret = __sendmsg(cinfo, &cmsg);
+ } else
+ pr_warn("md-cluster: No good device id found to send\n");
unlock_comm(cinfo);
return ret;
}
-static int metadata_update_cancel(struct mddev *mddev)
+static void metadata_update_cancel(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
+ unlock_comm(cinfo);
+}
- return dlm_unlock_sync(cinfo->token_lockres);
+static int resync_start(struct mddev *mddev)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
+ return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
}
-static int resync_send(struct mddev *mddev, enum msg_type type,
- sector_t lo, sector_t hi)
+static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
- struct cluster_msg cmsg;
- int slot = cinfo->slot_number - 1;
+ struct cluster_msg cmsg = {0};
- pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
- (unsigned long long)lo,
- (unsigned long long)hi);
- resync_info_update(mddev, lo, hi);
- cmsg.type = cpu_to_le32(type);
- cmsg.slot = cpu_to_le32(slot);
+ add_resync_info(cinfo->bitmap_lockres, lo, hi);
+ /* Re-acquire the lock to refresh LVB */
+ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
+ cmsg.type = cpu_to_le32(RESYNCING);
cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi);
- return sendmsg(cinfo, &cmsg);
-}
-static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
-{
- pr_info("%s:%d\n", __func__, __LINE__);
- return resync_send(mddev, RESYNCING, lo, hi);
+ return sendmsg(cinfo, &cmsg);
}
-static void resync_finish(struct mddev *mddev)
+static int resync_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
- struct cluster_msg cmsg;
- int slot = cinfo->slot_number - 1;
-
- pr_info("%s:%d\n", __func__, __LINE__);
- resync_send(mddev, RESYNCING, 0, 0);
- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
- cmsg.slot = cpu_to_le32(slot);
- sendmsg(cinfo, &cmsg);
- }
+ cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
+ dlm_unlock_sync(cinfo->resync_lockres);
+ return resync_info_update(mddev, 0, 0);
}
static int area_resyncing(struct mddev *mddev, int direction,
@@ -896,7 +926,11 @@ out:
return ret;
}
-static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
+/* add_new_disk() - initiates a disk add
+ * However, if this fails before writing md_update_sb(),
+ * add_new_disk_cancel() must be called to release token lock
+ */
+static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
@@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
lock_comm(cinfo);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
@@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
/* Some node does not "see" the device */
if (ret == -EAGAIN)
ret = -ENOENT;
+ if (ret)
+ unlock_comm(cinfo);
else
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
return ret;
}
-static int add_new_disk_finish(struct mddev *mddev)
+static void add_new_disk_cancel(struct mddev *mddev)
{
- struct cluster_msg cmsg;
struct md_cluster_info *cinfo = mddev->cluster_info;
- int ret;
- /* Write sb and inform others */
- md_update_sb(mddev, 1);
- cmsg.type = METADATA_UPDATED;
- ret = __sendmsg(cinfo, &cmsg);
unlock_comm(cinfo);
- return ret;
}
static int new_disk_ack(struct mddev *mddev, bool ack)
@@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
- struct cluster_msg cmsg;
+ struct cluster_msg cmsg = {0};
struct md_cluster_info *cinfo = mddev->cluster_info;
- cmsg.type = REMOVE;
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.type = cpu_to_le32(REMOVE);
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
return __sendmsg(cinfo, &cmsg);
}
@@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev)
{
int sn, err;
sector_t lo, hi;
- struct cluster_msg cmsg;
+ struct cluster_msg cmsg = {0};
struct mddev *mddev = rdev->mddev;
struct md_cluster_info *cinfo = mddev->cluster_info;
- cmsg.type = RE_ADD;
- cmsg.raid_slot = rdev->desc_nr;
+ cmsg.type = cpu_to_le32(RE_ADD);
+ cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
err = sendmsg(cinfo, &cmsg);
if (err)
goto out;
@@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = {
.join = join,
.leave = leave,
.slot_number = slot_number,
- .resync_info_update = resync_info_update,
.resync_start = resync_start,
.resync_finish = resync_finish,
+ .resync_info_update = resync_info_update,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
.metadata_update_cancel = metadata_update_cancel,
.area_resyncing = area_resyncing,
- .add_new_disk_start = add_new_disk_start,
- .add_new_disk_finish = add_new_disk_finish,
+ .add_new_disk = add_new_disk,
+ .add_new_disk_cancel = add_new_disk_cancel,
.new_disk_ack = new_disk_ack,
.remove_disk = remove_disk,
.gather_bitmaps = gather_bitmaps,
@@ -1022,5 +1051,6 @@ static void cluster_exit(void)
module_init(cluster_init);
module_exit(cluster_exit);
+MODULE_AUTHOR("SUSE");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 00defe2badbc..e75ea2613184 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -12,15 +12,15 @@ struct md_cluster_operations {
int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
- void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
- int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
- void (*resync_finish)(struct mddev *mddev);
+ int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
- int (*metadata_update_cancel)(struct mddev *mddev);
+ void (*metadata_update_cancel)(struct mddev *mddev);
+ int (*resync_start)(struct mddev *mddev);
+ int (*resync_finish)(struct mddev *mddev);
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
- int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
- int (*add_new_disk_finish)(struct mddev *mddev);
+ int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
+ void (*add_new_disk_cancel)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack);
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
int (*gather_bitmaps)(struct md_rdev *rdev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 740ee9bc5ad3..3f9a514b5b9d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
++ev1;
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
- le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
+ (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
+ le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
if (ev1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
@@ -1628,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
int role;
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
- role = 0xffff;
+ role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} else
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
- case 0xffff: /* spare */
+ case MD_DISK_ROLE_SPARE: /* spare */
break;
- case 0xfffe: /* faulty */
+ case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags);
break;
+ case MD_DISK_ROLE_JOURNAL: /* journal device */
+ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
+ /* journal device without journal feature */
+ printk(KERN_WARNING
+ "md: journal device provided without journal feature, ignoring the device\n");
+ return -EINVAL;
+ }
+ set_bit(Journal, &rdev->flags);
+ rdev->journal_tail = le64_to_cpu(sb->journal_tail);
+ if (mddev->recovery_cp == MaxSector)
+ set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+ rdev->raid_disk = mddev->raid_disks;
+ break;
default:
rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
@@ -1655,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
+ set_bit(MD_HAS_JOURNAL, &mddev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
@@ -1679,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync)
sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
+ sb->resync_offset = cpu_to_le64(MaxSector);
else
sb->resync_offset = cpu_to_le64(0);
@@ -1702,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
}
- if (rdev->raid_disk >= 0 &&
+ if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags)) {
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
@@ -1712,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
}
+ /* Note: recovery_offset and journal_tail share space */
+ if (test_bit(Journal, &rdev->flags))
+ sb->journal_tail = cpu_to_le64(rdev->journal_tail);
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
@@ -1735,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
}
}
+ if (mddev_is_clustered(mddev))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
+
if (rdev->badblocks.count == 0)
/* Nothing to do for bad blocks*/ ;
else if (sb->bblog_offset == 0)
@@ -1785,18 +1809,23 @@ retry:
max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++)
- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
+
+ if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else if (test_bit(Journal, &rdev2->flags))
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
else if (rdev2->raid_disk >= 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else
- sb->dev_roles[i] = cpu_to_le16(0xffff);
+ sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
}
sb->sb_csum = calc_sb_1_csum(sb);
@@ -1912,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
struct md_rdev *rdev, *rdev2;
rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev1)
- rdev_for_each_rcu(rdev2, mddev2)
+ rdev_for_each_rcu(rdev, mddev1) {
+ if (test_bit(Faulty, &rdev->flags) ||
+ test_bit(Journal, &rdev->flags) ||
+ rdev->raid_disk == -1)
+ continue;
+ rdev_for_each_rcu(rdev2, mddev2) {
+ if (test_bit(Faulty, &rdev2->flags) ||
+ test_bit(Journal, &rdev2->flags) ||
+ rdev2->raid_disk == -1)
+ continue;
if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
rcu_read_unlock();
return 1;
}
+ }
+ }
rcu_read_unlock();
return 0;
}
@@ -2194,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares)
}
}
+static bool does_sb_need_changing(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct mdp_superblock_1 *sb;
+ int role;
+
+ /* Find a good rdev */
+ rdev_for_each(rdev, mddev)
+ if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
+ break;
+
+ /* No good device found. */
+ if (!rdev)
+ return false;
+
+ sb = page_address(rdev->sb_page);
+ /* Check if a device has become faulty or a spare become active */
+ rdev_for_each(rdev, mddev) {
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ /* Device activated? */
+ if (role == 0xffff && rdev->raid_disk >=0 &&
+ !test_bit(Faulty, &rdev->flags))
+ return true;
+ /* Device turned faulty? */
+ if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
+ return true;
+ }
+
+ /* Check if any mddev parameters have changed */
+ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
+ (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
+ (mddev->layout != le64_to_cpu(sb->layout)) ||
+ (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
+ (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
+ return true;
+
+ return false;
+}
+
void md_update_sb(struct mddev *mddev, int force_change)
{
struct md_rdev *rdev;
int sync_req;
int nospares = 0;
int any_badblocks_changed = 0;
+ int ret = -1;
if (mddev->ro) {
if (force_change)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return;
}
+
+ if (mddev_is_clustered(mddev)) {
+ if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+ force_change = 1;
+ ret = md_cluster_ops->metadata_update_start(mddev);
+ /* Has someone else has updated the sb */
+ if (!does_sb_need_changing(mddev)) {
+ if (ret == 0)
+ md_cluster_ops->metadata_update_cancel(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ return;
+ }
+ }
repeat:
/* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed;
@@ -2354,6 +2447,9 @@ repeat:
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
+
+ if (mddev_is_clustered(mddev) && ret == 0)
+ md_cluster_ops->metadata_update_finish(mddev);
}
EXPORT_SYMBOL(md_update_sb);
@@ -2429,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
+ if (test_bit(Journal, &flags)) {
+ len += sprintf(page+len, "%sjournal",sep);
+ sep = ",";
+ }
if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
@@ -2440,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page)
sep = ",";
}
if (!test_bit(Faulty, &flags) &&
+ !test_bit(Journal, &flags) &&
!test_bit(In_sync, &