diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 21:12:47 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 21:12:47 -0800 |
commit | ac322de6bf5416cb145b58599297b8be73cd86ac (patch) | |
tree | 1a1be9f8b9241159fb4cde14a548eba9a4155b28 /drivers/md | |
parent | ccf21b69a83afaee4d5499e0d03eacf23946e08c (diff) | |
parent | 339421def582abb14c2217aa8c8f28bb2e299174 (diff) |
Merge tag 'md/4.4' of git://neil.brown.name/md
Pull md updates from Neil Brown:
"Two major components to this update.
1) The clustered-raid1 support from SUSE is nearly complete. There
are a few outstanding issues being worked on. Maybe half a dozen
patches will bring this to a usable state.
2) The first stage of journalled-raid5 support from Facebook makes an
appearance. With a journal device configured (typically NVRAM or
SSD), the "RAID5 write hole" should be closed - a crash during
degraded operations cannot result in data corruption.
The next stage will be to use the journal as a write-behind cache
so that latency can be reduced and in some cases throughput
increased by performing more full-stripe writes.
* tag 'md/4.4' of git://neil.brown.name/md: (66 commits)
MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW
MD: set journal disk ->raid_disk
MD: kick out journal disk if it's not fresh
raid5-cache: start raid5 readonly if journal is missing
MD: add new bit to indicate raid array with journal
raid5-cache: IO error handling
raid5: journal disk can't be removed
raid5-cache: add trim support for log
MD: fix info output for journal disk
raid5-cache: use bio chaining
raid5-cache: small log->seq cleanup
raid5-cache: new helper: r5_reserve_log_entry
raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta
raid5-cache: take rdev->data_offset into account early on
raid5-cache: refactor bio allocation
raid5-cache: clean up r5l_get_meta
raid5-cache: simplify state machine when caches flushes are not needed
raid5-cache: factor out a helper to run all stripes for an I/O unit
raid5-cache: rename flushed_ios to finished_ios
raid5-cache: free I/O units earlier
...
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Makefile | 2 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 14 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 4 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 228 | ||||
-rw-r--r-- | drivers/md/md-cluster.h | 12 | ||||
-rw-r--r-- | drivers/md/md.c | 495 | ||||
-rw-r--r-- | drivers/md/md.h | 17 | ||||
-rw-r--r-- | drivers/md/raid1.c | 41 | ||||
-rw-r--r-- | drivers/md/raid1.h | 7 | ||||
-rw-r--r-- | drivers/md/raid10.c | 2 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 1191 | ||||
-rw-r--r-- | drivers/md/raid5.c | 189 | ||||
-rw-r--r-- | drivers/md/raid5.h | 24 |
13 files changed, 1917 insertions, 309 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 462f443a4f85..f34979cd141a 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o md-mod-y += md.o bitmap.o -raid456-y += raid5.o +raid456-y += raid5.o raid5-cache.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 48b5890c28e3..4f22e919787a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -613,12 +613,10 @@ re_read: daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - /* XXX: This is a hack to ensure that we don't use clustering - * in case: - * - dm-raid is in use and - * - the nodes written in bitmap_sb is erroneous. + /* Setup nodes/clustername only if bitmap version is + * cluster-compatible */ - if (!bitmap->mddev->sync_super) { + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { nodes = le32_to_cpu(sb->nodes); strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); @@ -628,7 +626,7 @@ re_read: if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) reason = "bad magic"; else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || - le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) + le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED) reason = "unrecognized superblock version"; else if (chunksize < 512) reason = "bitmap chunksize too small"; @@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap) } EXPORT_SYMBOL(bitmap_close_sync); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) { sector_t s = 0; sector_t blocks; @@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->last_end_sync = jiffies; return; } - if (time_before(jiffies, (bitmap->last_end_sync + if (!force && time_before(jiffies, (bitmap->last_end_sync + bitmap->mddev->bitmap_info.daemon_sleep))) return; wait_event(bitmap->mddev->recovery_wait, diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index f1f4dd01090d..7d5c3a610ca5 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -9,8 +9,10 @@ #define BITMAP_MAJOR_LO 3 /* version 4 insists the bitmap is in little-endian order * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices */ #define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 #define BITMAP_MAJOR_HOSTENDIAN 3 /* @@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); void bitmap_unplug(struct bitmap *bitmap); void bitmap_daemon_work(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 11e3bc9d2a4b..d6a1126d85ce 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -28,6 +28,7 @@ struct dlm_lock_resource { struct completion completion; /* completion for synchronized locking */ void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ struct mddev *mddev; /* pointing back to mddev. */ + int mode; }; struct suspend_info { @@ -53,8 +54,8 @@ struct md_cluster_info { dlm_lockspace_t *lockspace; int slot_number; struct completion completion; - struct mutex sb_mutex; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; struct md_thread *recovery_thread; @@ -79,20 +80,20 @@ enum msg_type { }; struct cluster_msg { - int type; - int slot; + __le32 type; + __le32 slot; /* TODO: Unionize this for smaller footprint */ - sector_t low; - sector_t high; + __le64 low; + __le64 high; char uuid[16]; - int raid_slot; + __le32 raid_slot; }; static void sync_ast(void *arg) { struct dlm_lock_resource *res; - res = (struct dlm_lock_resource *) arg; + res = arg; complete(&res->completion); } @@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) if (ret) return ret; wait_for_completion(&res->completion); + if (res->lksb.sb_status == 0) + res->mode = mode; return res->lksb.sb_status; } @@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, init_completion(&res->completion); res->ls = cinfo->lockspace; res->mddev = mddev; + res->mode = DLM_LOCK_IV; namelen = strlen(name); res->name = kzalloc(namelen + 1, GFP_KERNEL); if (!res->name) { @@ -191,8 +195,8 @@ retry: kfree(res); } -static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, - sector_t lo, sector_t hi) +static void add_resync_info(struct dlm_lock_resource *lockres, + sector_t lo, sector_t hi) { struct resync_info *ri; @@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc dlm_lock_sync(lockres, DLM_LOCK_CR); memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); hi = le64_to_cpu(ri.hi); - if (ri.hi > 0) { + if (hi > 0) { s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); if (!s) goto out; @@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = { */ static void ack_bast(void *arg, int mode) { - struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg; + struct dlm_lock_resource *res = arg; struct md_cluster_info *cinfo = res->mddev->cluster_info; if (mode == DLM_LOCK_EX) @@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) if (slot == s->slot) { - pr_info("%s:%d Deleting suspend_info: %d\n", - __func__, __LINE__, slot); list_del(&s->list); kfree(s); break; } } -static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) +static void remove_suspend_info(struct mddev *mddev, int slot) { + struct md_cluster_info *cinfo = mddev->cluster_info; spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 2); } -static void process_suspend_info(struct md_cluster_info *cinfo, +static void process_suspend_info(struct mddev *mddev, int slot, sector_t lo, sector_t hi) { + struct md_cluster_info *cinfo = mddev->cluster_info; struct suspend_info *s; if (!hi) { - remove_suspend_info(cinfo, slot); + remove_suspend_info(mddev, slot); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return; } s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); @@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo, s->slot = slot; s->lo = lo; s->hi = hi; + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 2); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) @@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) len = snprintf(disk_uuid, 64, "DEVICE_UUID="); sprintf(disk_uuid + len, "%pU", cmsg->uuid); - snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); + snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot)); pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); init_completion(&cinfo->newdisk_completion); set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); @@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - - md_reload_sb(mddev); + md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); } static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, + le32_to_cpu(msg->raid_slot)); if (rdev) md_kick_rdev_from_array(rdev); else - pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); + pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, + le32_to_cpu(msg->raid_slot)); if (rdev && test_bit(Faulty, &rdev->flags)) clear_bit(Faulty, &rdev->flags); else - pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); + pr_warn("%s: %d Could not find disk(%d) which is faulty", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) { - switch (msg->type) { + if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), + "node %d received it's own msg\n", le32_to_cpu(msg->slot))) + return; + switch (le32_to_cpu(msg->type)) { case METADATA_UPDATED: - pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", - __func__, __LINE__, msg->slot); process_metadata_update(mddev, msg); break; case RESYNCING: - pr_info("%s: %d Received message: RESYNCING from %d\n", - __func__, __LINE__, msg->slot); - process_suspend_info(mddev->cluster_info, msg->slot, - msg->low, msg->high); + process_suspend_info(mddev, le32_to_cpu(msg->slot), + le64_to_cpu(msg->low), + le64_to_cpu(msg->high)); break; case NEWDISK: - pr_info("%s: %d Received message: NEWDISK from %d\n", - __func__, __LINE__, msg->slot); process_add_new_disk(mddev, msg); break; case REMOVE: - pr_info("%s: %d Received REMOVE from %d\n", - __func__, __LINE__, msg->slot); process_remove_disk(mddev, msg); break; case RE_ADD: - pr_info("%s: %d Received RE_ADD from %d\n", - __func__, __LINE__, msg->slot); process_readd_disk(mddev, msg); break; case BITMAP_NEEDS_SYNC: - pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n", - __func__, __LINE__, msg->slot); - __recover_slot(mddev, msg->slot); + __recover_slot(mddev, le32_to_cpu(msg->slot)); break; default: pr_warn("%s:%d Received unknown message from %d\n", @@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread) /* lock_comm() * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. + * If called again, and the TOKEN lock is alread in EX mode + * return success. However, care must be taken that unlock_comm() + * is called only once. */ static int lock_comm(struct md_cluster_info *cinfo) { int error; + if (cinfo->token_lockres->mode == DLM_LOCK_EX) + return 0; + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", @@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo) static void unlock_comm(struct md_cluster_info *cinfo) { + WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); dlm_unlock_sync(cinfo->token_lockres); } @@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes) init_completion(&cinfo->completion); set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); - mutex_init(&cinfo->sb_mutex); mddev->cluster_info = cinfo; memset(str, 0, 64); @@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes) goto err; } + cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); + if (!cinfo->resync_lockres) + goto err; + ret = gather_all_resync_info(mddev, nodes); if (ret) goto err; @@ -763,6 +778,7 @@ err: lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); + lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) dlm_release_lockspace(cinfo->lockspace, 2); @@ -771,12 +787,32 @@ err: return ret; } +static void resync_bitmap(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int err; + + cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); + err = sendmsg(cinfo, &cmsg); + if (err) + pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n", + __func__, __LINE__, err); +} + static int leave(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; if (!cinfo) return 0; + + /* BITMAP_NEEDS_SYNC message should be sent when node + * is leaving the cluster with dirty bitmap, also we + * can only deliver it when dlm connection is available */ + if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) + resync_bitmap(mddev); + md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(&cinfo->recv_thread); lockres_free(cinfo->message_lockres); @@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev) return cinfo->slot_number - 1; } -static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) -{ - struct md_cluster_info *cinfo = mddev->cluster_info; - - add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); - /* Re-acquire the lock to refresh LVB */ - dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); -} - static int metadata_update_start(struct mddev *mddev) { return lock_comm(mddev->cluster_info); @@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; - int ret; + struct md_rdev *rdev; + int ret = 0; + int raid_slot = -1; memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); - ret = __sendmsg(cinfo, &cmsg); + /* Pick up a good active device number to send. + */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + raid_slot = rdev->desc_nr; + break; + } + if (raid_slot >= 0) { + cmsg.raid_slot = cpu_to_le32(raid_slot); + ret = __sendmsg(cinfo, &cmsg); + } else + pr_warn("md-cluster: No good device id found to send\n"); unlock_comm(cinfo); return ret; } -static int metadata_update_cancel(struct mddev *mddev) +static void metadata_update_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + unlock_comm(cinfo); +} - return dlm_unlock_sync(cinfo->token_lockres); +static int resync_start(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; + return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); } -static int resync_send(struct mddev *mddev, enum msg_type type, - sector_t lo, sector_t hi) +static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct cluster_msg cmsg; - int slot = cinfo->slot_number - 1; + struct cluster_msg cmsg = {0}; - pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, - (unsigned long long)lo, - (unsigned long long)hi); - resync_info_update(mddev, lo, hi); - cmsg.type = cpu_to_le32(type); - cmsg.slot = cpu_to_le32(slot); + add_resync_info(cinfo->bitmap_lockres, lo, hi); + /* Re-acquire the lock to refresh LVB */ + dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); + cmsg.type = cpu_to_le32(RESYNCING); cmsg.low = cpu_to_le64(lo); cmsg.high = cpu_to_le64(hi); - return sendmsg(cinfo, &cmsg); -} -static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) -{ - pr_info("%s:%d\n", __func__, __LINE__); - return resync_send(mddev, RESYNCING, lo, hi); + return sendmsg(cinfo, &cmsg); } -static void resync_finish(struct mddev *mddev) +static int resync_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct cluster_msg cmsg; - int slot = cinfo->slot_number - 1; - - pr_info("%s:%d\n", __func__, __LINE__); - resync_send(mddev, RESYNCING, 0, 0); - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); - cmsg.slot = cpu_to_le32(slot); - sendmsg(cinfo, &cmsg); - } + cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; + dlm_unlock_sync(cinfo->resync_lockres); + return resync_info_update(mddev, 0, 0); } static int area_resyncing(struct mddev *mddev, int direction, @@ -896,7 +926,11 @@ out: return ret; } -static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) +/* add_new_disk() - initiates a disk add + * However, if this fails before writing md_update_sb(), + * add_new_disk_cancel() must be called to release token lock + */ +static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; @@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); - cmsg.raid_slot = rdev->desc_nr; + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); lock_comm(cinfo); ret = __sendmsg(cinfo, &cmsg); if (ret) @@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) /* Some node does not "see" the device */ if (ret == -EAGAIN) ret = -ENOENT; + if (ret) + unlock_comm(cinfo); else dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); return ret; } -static int add_new_disk_finish(struct mddev *mddev) +static void add_new_disk_cancel(struct mddev *mddev) { - struct cluster_msg cmsg; struct md_cluster_info *cinfo = mddev->cluster_info; - int ret; - /* Write sb and inform others */ - md_update_sb(mddev, 1); - cmsg.type = METADATA_UPDATED; - ret = __sendmsg(cinfo, &cmsg); unlock_comm(cinfo); - return ret; } static int new_disk_ack(struct mddev *mddev, bool ack) @@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack) static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) { - struct cluster_msg cmsg; + struct cluster_msg cmsg = {0}; struct md_cluster_info *cinfo = mddev->cluster_info; - cmsg.type = REMOVE; - cmsg.raid_slot = rdev->desc_nr; + cmsg.type = cpu_to_le32(REMOVE); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); return __sendmsg(cinfo, &cmsg); } @@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev) { int sn, err; sector_t lo, hi; - struct cluster_msg cmsg; + struct cluster_msg cmsg = {0}; struct mddev *mddev = rdev->mddev; struct md_cluster_info *cinfo = mddev->cluster_info; - cmsg.type = RE_ADD; - cmsg.raid_slot = rdev->desc_nr; + cmsg.type = cpu_to_le32(RE_ADD); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); err = sendmsg(cinfo, &cmsg); if (err) goto out; @@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, .slot_number = slot_number, - .resync_info_update = resync_info_update, .resync_start = resync_start, .resync_finish = resync_finish, + .resync_info_update = resync_info_update, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, .area_resyncing = area_resyncing, - .add_new_disk_start = add_new_disk_start, - .add_new_disk_finish = add_new_disk_finish, + .add_new_disk = add_new_disk, + .add_new_disk_cancel = add_new_disk_cancel, .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, .gather_bitmaps = gather_bitmaps, @@ -1022,5 +1051,6 @@ static void cluster_exit(void) module_init(cluster_init); module_exit(cluster_exit); +MODULE_AUTHOR("SUSE"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Clustering support for MD"); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 00defe2badbc..e75ea2613184 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -12,15 +12,15 @@ struct md_cluster_operations { int (*join)(struct mddev *mddev, int nodes); int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); - void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); - int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); - void (*resync_finish)(struct mddev *mddev); + int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); - int (*metadata_update_cancel)(struct mddev *mddev); + void (*metadata_update_cancel)(struct mddev *mddev); + int (*resync_start)(struct mddev *mddev); + int (*resync_finish)(struct mddev *mddev); int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); - int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); - int (*add_new_disk_finish)(struct mddev *mddev); + int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev); + void (*add_new_disk_cancel)(struct mddev *mddev); int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 740ee9bc5ad3..3f9a514b5b9d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ++ev1; if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { @@ -1628,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) int role; if (rdev->desc_nr < 0 || rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = 0xffff; + role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; } else role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { - case 0xffff: /* spare */ + case MD_DISK_ROLE_SPARE: /* spare */ break; - case 0xfffe: /* faulty */ + case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + printk(KERN_WARNING + "md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + rdev->raid_disk = mddev->raid_disks; + break; default: rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & @@ -1655,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) + set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -1679,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) + sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); @@ -1702,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } - if (rdev->raid_disk >= 0 && + if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); @@ -1712,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } + /* Note: recovery_offset and journal_tail share space */ + if (test_bit(Journal, &rdev->flags)) + sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); @@ -1735,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) } } + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) @@ -1785,18 +1809,23 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (test_bit(Journal, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else - sb->dev_roles[i] = cpu_to_le16(0xffff); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); @@ -1912,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) struct md_rdev *rdev, *rdev2; rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev1) - rdev_for_each_rcu(rdev2, mddev2) + rdev_for_each_rcu(rdev, mddev1) { + if (test_bit(Faulty, &rdev->flags) || + test_bit(Journal, &rdev->flags) || + rdev->raid_disk == -1) + continue; + rdev_for_each_rcu(rdev2, mddev2) { + if (test_bit(Faulty, &rdev2->flags) || + test_bit(Journal, &rdev2->flags) || + rdev2->raid_disk == -1) + continue; if (rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { rcu_read_unlock(); return 1; } + } + } rcu_read_unlock(); return 0; } @@ -2194,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(rdev, mddev) + if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + break; + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == 0xffff && rdev->raid_disk >=0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? */ + if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; + int ret = -1; if (mddev->ro) { if (force_change) set_bit(MD_CHANGE_DEVS, &mddev->flags); return; } + + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + ret = md_cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + if (ret == 0) + md_cluster_ops->metadata_update_cancel(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + return; + } + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; @@ -2354,6 +2447,9 @@ repeat: clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } + + if (mddev_is_clustered(mddev) && ret == 0) + md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -2429,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page) len += sprintf(page+len, "%sin_sync",sep); sep = ","; } + if (test_bit(Journal, &flags)) { + len += sprintf(page+len, "%sjournal",sep); + sep = ","; + } if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; @@ -2440,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page) sep = ","; } if (!test_bit(Faulty, &flags) && + !test_bit(Journal, &flags) && !test_bit(In_sync, & |