summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/md.txt29
-rw-r--r--drivers/md/bitmap.c137
-rw-r--r--drivers/md/bitmap.h5
-rw-r--r--drivers/md/md.c871
-rw-r--r--drivers/md/md.h110
-rw-r--r--drivers/md/raid1.c962
-rw-r--r--drivers/md/raid1.h26
-rw-r--r--drivers/md/raid10.c1183
-rw-r--r--drivers/md/raid10.h21
-rw-r--r--drivers/md/raid5.c1015
-rw-r--r--drivers/md/raid5.h99
-rw-r--r--include/linux/raid/md_p.h14
12 files changed, 3093 insertions, 1379 deletions
diff --git a/Documentation/md.txt b/Documentation/md.txt
index f0eee83ff78a..fc94770f44ab 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -360,18 +360,20 @@ Each directory contains:
A file recording the current state of the device in the array
which can be a comma separated list of
faulty - device has been kicked from active use due to
- a detected fault
+ a detected fault or it has unacknowledged bad
+ blocks
in_sync - device is a fully in-sync member of the array
writemostly - device will only be subject to read
requests if there are no other options.
This applies only to raid1 arrays.
- blocked - device has failed, metadata is "external",
- and the failure hasn't been acknowledged yet.
+ blocked - device has failed, and the failure hasn't been
+ acknowledged yet by the metadata handler.
Writes that would write to this device if
it were not faulty are blocked.
spare - device is working, but not a full member.
This includes spares that are in the process
of being recovered to
+ write_error - device has ever seen a write error.
This list may grow in future.
This can be written to.
Writing "faulty" simulates a failure on the device.
@@ -379,9 +381,11 @@ Each directory contains:
Writing "writemostly" sets the writemostly flag.
Writing "-writemostly" clears the writemostly flag.
Writing "blocked" sets the "blocked" flag.
- Writing "-blocked" clears the "blocked" flag and allows writes
- to complete.
+ Writing "-blocked" clears the "blocked" flags and allows writes
+ to complete and possibly simulates an error.
Writing "in_sync" sets the in_sync flag.
+ Writing "write_error" sets writeerrorseen flag.
+ Writing "-write_error" clears writeerrorseen flag.
This file responds to select/poll. Any change to 'faulty'
or 'blocked' causes an event.
@@ -419,7 +423,6 @@ Each directory contains:
written, it will be rejected.
recovery_start
-
When the device is not 'in_sync', this records the number of
sectors from the start of the device which are known to be
correct. This is normally zero, but during a recovery
@@ -435,6 +438,20 @@ Each directory contains:
Setting this to 'none' is equivalent to setting 'in_sync'.
Setting to any other value also clears the 'in_sync' flag.
+ bad_blocks
+ This gives the list of all known bad blocks in the form of
+ start address and length (in sectors respectively). If output
+ is too big to fit in a page, it will be truncated. Writing
+ "sector length" to this file adds new acknowledged (i.e.
+ recorded to disk safely) bad blocks.
+
+ unacknowledged_bad_blocks
+ This gives the list of known-but-not-yet-saved-to-disk bad
+ blocks in the same form of 'bad_blocks'. If output is too big
+ to fit in a page, it will be truncated. Writing to this file
+ adds bad blocks without acknowledging them. This is largely
+ for testing.
+
An active md device will also contain and entry for each active device
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 574b09afedd3..0dc6546b77a8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -29,7 +29,6 @@
#include "md.h"
#include "bitmap.h"
-#include <linux/dm-dirty-log.h>
/* debug macros */
#define DEBUG 0
@@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
* 0 or page 1
*/
static inline struct page *filemap_get_page(struct bitmap *bitmap,
- unsigned long chunk)
+ unsigned long chunk)
{
- if (bitmap->filemap == NULL)
- return NULL;
if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
return NULL;
return bitmap->filemap[file_page_index(bitmap, chunk)
@@ -878,28 +875,19 @@ enum bitmap_page_attr {
static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
enum bitmap_page_attr attr)
{
- if (page)
- __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
- else
- __set_bit(attr, &bitmap->logattrs);
+ __set_bit((page->index<<2) + attr, bitmap->filemap_attr);
}
static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
enum bitmap_page_attr attr)
{
- if (page)
- __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
- else
- __clear_bit(attr, &bitmap->logattrs);
+ __clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
}
static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
enum bitmap_page_attr attr)
{
- if (page)
- return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
- else
- return test_bit(attr, &bitmap->logattrs);
+ return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
}
/*
@@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p
static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
{
unsigned long bit;
- struct page *page = NULL;
+ struct page *page;
void *kaddr;
unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
- if (!bitmap->filemap) {
- struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
- if (log)
- log->type->mark_region(log, chunk);
- } else {
+ if (!bitmap->filemap)
+ return;
- page = filemap_get_page(bitmap, chunk);
- if (!page)
- return;
- bit = file_page_offset(bitmap, chunk);
+ page = filemap_get_page(bitmap, chunk);
+ if (!page)
+ return;
+ bit = file_page_offset(bitmap, chunk);
- /* set the bit */
- kaddr = kmap_atomic(page, KM_USER0);
- if (bitmap->flags & BITMAP_HOSTENDIAN)
- set_bit(bit, kaddr);
- else
- __test_and_set_bit_le(bit, kaddr);
- kunmap_atomic(kaddr, KM_USER0);
- PRINTK("set file bit %lu page %lu\n", bit, page->index);
- }
+ /* set the bit */
+ kaddr = kmap_atomic(page, KM_USER0);
+ if (bitmap->flags & BITMAP_HOSTENDIAN)
+ set_bit(bit, kaddr);
+ else
+ __set_bit_le(bit, kaddr);
+ kunmap_atomic(kaddr, KM_USER0);
+ PRINTK("set file bit %lu page %lu\n", bit, page->index);
/* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
}
@@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap)
if (!bitmap)
return;
- if (!bitmap->filemap) {
- /* Must be using a dirty_log */
- struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
- dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
- need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);
- if (dirty || need_write)
- if (log->type->flush(log))
- bitmap->flags |= BITMAP_WRITE_ERROR;
- goto out;
- }
/* look at each page to see if there are any set bits that need to be
* flushed out to disk */
@@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap)
else
md_super_wait(bitmap->mddev);
}
-out:
if (bitmap->flags & BITMAP_WRITE_ERROR)
bitmap_file_kick(bitmap);
}
@@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev)
struct page *page = NULL, *lastpage = NULL;
sector_t blocks;
void *paddr;
- struct dm_dirty_log *log = mddev->bitmap_info.log;
/* Use a mutex to guard daemon_work against
* bitmap_destroy.
@@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev)
spin_lock_irqsave(&bitmap->lock, flags);
for (j = 0; j < bitmap->chunks; j++) {
bitmap_counter_t *bmc;
- if (!bitmap->filemap) {
- if (!log)
- /* error or shutdown */
- break;
- } else
- page = filemap_get_page(bitmap, j);
+ if (!bitmap->filemap)
+ /* error or shutdown */
+ break;
+
+ page = filemap_get_page(bitmap, j);
if (page != lastpage) {
/* skip this page unless it's marked as needing cleaning */
@@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev)
-1);
/* clear the bit */
- if (page) {
- paddr = kmap_atomic(page, KM_USER0);
- if (bitmap->flags & BITMAP_HOSTENDIAN)
- clear_bit(file_page_offset(bitmap, j),
- paddr);
- else
- __test_and_clear_bit_le(file_page_offset(bitmap, j),
- paddr);
- kunmap_atomic(paddr, KM_USER0);
- } else
- log->type->clear_region(log, j);
+ paddr = kmap_atomic(page, KM_USER0);
+ if (bitmap->flags & BITMAP_HOSTENDIAN)
+ clear_bit(file_page_offset(bitmap, j),
+ paddr);
+ else
+ __clear_bit_le(
+ file_page_offset(bitmap,
+ j),
+ paddr);
+ kunmap_atomic(paddr, KM_USER0);
}
} else
j |= PAGE_COUNTER_MASK;
@@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev)
spin_unlock_irqrestore(&bitmap->lock, flags);
/* now sync the final page */
- if (lastpage != NULL || log != NULL) {
+ if (lastpage != NULL) {
spin_lock_irqsave(&bitmap->lock, flags);
if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
- if (lastpage)
- write_page(bitmap, lastpage, 0);
- else
- if (log->type->flush(log))
- bitmap->flags |= BITMAP_WRITE_ERROR;
+ write_page(bitmap, lastpage, 0);
} else {
set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev)
BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
if (!file
- && !mddev->bitmap_info.offset
- && !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */
+ && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
return 0;
BUG_ON(file && mddev->bitmap_info.offset);
- BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log);
bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
if (!bitmap)
@@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev)
int bitmap_load(mddev_t *mddev)
{
int err = 0;
+ sector_t start = 0;
sector_t sector = 0;
struct bitmap *bitmap = mddev->bitmap;
@@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev)
}
bitmap_close_sync(bitmap);
- if (mddev->bitmap_info.log) {
- unsigned long i;
- struct dm_dirty_log *log = mddev->bitmap_info.log;
- for (i = 0; i < bitmap->chunks; i++)
- if (!log->type->in_sync(log, i, 1))
- bitmap_set_memory_bits(bitmap,
- (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
- 1);
- } else {
- sector_t start = 0;
- if (mddev->degraded == 0
- || bitmap->events_cleared == mddev->events)
- /* no need to keep dirty bits to optimise a
- * re-add of a missing device */
- start = mddev->recovery_cp;
-
- err = bitmap_init_from_disk(bitmap, start);
- }
+ if (mddev->degraded == 0
+ || bitmap->events_cleared == mddev->events)
+ /* no need to keep dirty bits to optimise a
+ * re-add of a missing device */
+ start = mddev->recovery_cp;
+
+ err = bitmap_init_from_disk(bitmap, start);
+
if (err)
goto out;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index b2a127e891ac..a28f2e5588c6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -212,10 +212,6 @@ struct bitmap {
unsigned long file_pages; /* number of pages in the file */
int last_page_size; /* bytes in the last page */
- unsigned long logattrs; /* used when filemap_attr doesn't exist
- * because we are working with a dirty_log
- */
-
unsigned long flags;
int allclean;
@@ -237,7 +233,6 @@ struct bitmap {
wait_queue_head_t behind_wait;
struct sysfs_dirent *sysfs_can_clear;
-
};
/* the bitmap API */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70b..8e221a20f5d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);
+void md_trim_bio(struct bio *bio, int offset, int size)
+{
+ /* 'bio' is a cloned bio which we need to trim to match
+ * the given offset and size.
+ * This requires adjusting bi_sector, bi_size, and bi_io_vec
+ */
+ int i;
+ struct bio_vec *bvec;
+ int sofar = 0;
+
+ size <<= 9;
+ if (offset == 0 && size == bio->bi_size)
+ return;
+
+ bio->bi_sector += offset;
+ bio->bi_size = size;
+ offset <<= 9;
+ clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+ while (bio->bi_idx < bio->bi_vcnt &&
+ bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
+ /* remove this whole bio_vec */
+ offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
+ bio->bi_idx++;
+ }
+ if (bio->bi_idx < bio->bi_vcnt) {
+ bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
+ bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
+ }
+ /* avoid any complications with bi_idx being non-zero*/
+ if (bio->bi_idx) {
+ memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+ (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+ bio->bi_vcnt -= bio->bi_idx;
+ bio->bi_idx = 0;
+ }
+ /* Make sure vcnt and last bv are not too big */
+ bio_for_each_segment(bvec, bio, i) {
+ if (sofar + bvec->bv_len > size)
+ bvec->bv_len = size - sofar;
+ if (bvec->bv_len == 0) {
+ bio->bi_vcnt = i;
+ break;
+ }
+ sofar += bvec->bv_len;
+ }
+}
+EXPORT_SYMBOL_GPL(md_trim_bio);
+
/*
* We have a system wide 'event count' that is incremented
* on any 'interesting' event, and readers of /proc/mdstat
@@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
rdev->sb_start = 0;
rdev->sectors = 0;
}
+ if (rdev->bb_page) {
+ put_page(rdev->bb_page);
+ rdev->bb_page = NULL;
+ }
}
@@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
ret = -EINVAL;
bdevname(rdev->bdev, b);
- sb = (mdp_super_t*)page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
if (sb->md_magic != MD_SB_MAGIC) {
printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
@@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
rdev->preferred_minor = sb->md_minor;
rdev->data_offset = 0;
rdev->sb_size = MD_SB_BYTES;
+ rdev->badblocks.shift = -1;
if (sb->level == LEVEL_MULTIPATH)
rdev->desc_nr = -1;
@@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
ret = 1;
} else {
__u64 ev1, ev2;
- mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+ mdp_super_t *refsb = page_address(refdev->sb_page);
if (!uuid_equal(refsb, sb)) {
printk(KERN_WARNING "md: %s has different UUID to %s\n",
b, bdevname(refdev->bdev,b2));
@@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
mdp_disk_t *desc;
- mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+ mdp_super_t *sb = page_address(rdev->sb_page);
__u64 ev1 = md_event(sb);
rdev->raid_disk = -1;
@@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->sb_size = MD_SB_BYTES;
- sb = (mdp_super_t*)page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
memset(sb, 0, sizeof(*sb));
@@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
return cpu_to_le32(csum);
}
+static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged);
static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
struct mdp_superblock_1 *sb;
@@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (ret) return ret;
- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
sb->major_version != cpu_to_le32(1) ||
@@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
else
rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ if (!rdev->bb_page) {
+ rdev->bb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->bb_page)
+ return -ENOMEM;
+ }
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
+ rdev->badblocks.count == 0) {
+ /* need to load the bad block list.
+ * Currently we limit it to one page.
+ */
+ s32 offset;
+ sector_t bb_sector;
+ u64 *bbp;
+ int i;
+ int sectors = le16_to_cpu(sb->bblog_size);
+ if (sectors > (PAGE_SIZE / 512))
+ return -EINVAL;
+ offset = le32_to_cpu(sb->bblog_offset);
+ if (offset == 0)
+ return -EINVAL;
+ bb_sector = (long long)offset;
+ if (!sync_page_io(rdev, bb_sector, sectors << 9,
+ rdev->bb_page, READ, true))
+ return -EIO;
+ bbp = (u64 *)page_address(rdev->bb_page);
+ rdev->badblocks.shift = sb->bblog_shift;
+ for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
+ u64 bb = le64_to_cpu(*bbp);
+ int count = bb & (0x3ff);
+ u64 sector = bb >> 10;
+ sector <<= sb->bblog_shift;
+ count <<= sb->bblog_shift;
+ if (bb + 1 == 0)
+ break;
+ if (md_set_badblocks(&rdev->badblocks,
+ sector, count, 1) == 0)
+ return -EINVAL;
+ }
+ } else if (sb->bblog_offset == 0)
+ rdev->badblocks.shift = -1;
+
if (!refdev) {
ret = 1;
} else {
__u64 ev1, ev2;
- struct mdp_superblock_1 *refsb =
- (struct mdp_superblock_1*)page_address(refdev->sb_page);
+ struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
sb->level != refsb->level ||
@@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
- struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
__u64 ev1 = le64_to_cpu(sb->events);
rdev->raid_disk = -1;
@@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
int max_dev, i;
/* make rdev->sb match mddev and rdev data. */
- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
sb->feature_map = 0;
sb->pad0 = 0;
sb->recovery_offset = cpu_to_le64(0);
memset(sb->pad1, 0, sizeof(sb->pad1));
- memset(sb->pad2, 0, sizeof(sb->pad2));
memset(sb->pad3, 0, sizeof(sb->pad3));
sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
}
+ if (rdev->badblocks.count == 0)
+ /* Nothing to do for bad blocks*/ ;
+ else if (sb->bblog_offset == 0)
+ /* Cannot record bad blocks on this device */
+ md_error(mddev, rdev);
+ else {
+ struct badblocks *bb = &rdev->badblocks;
+ u64 *bbp = (u64 *)page_address(rdev->bb_page);
+ u64 *p = bb->page;
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
+ if (bb->changed) {
+ unsigned seq;
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ memset(bbp, 0xff, PAGE_SIZE);
+
+ for (i = 0 ; i < bb->count ; i++) {
+ u64 internal_bb = *p++;
+ u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
+ | BB_LEN(internal_bb));
+ *bbp++ = cpu_to_le64(store_bb);
+ }
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ bb->sector = (rdev->sb_start +
+ (int)le32_to_cpu(sb->bblog_offset));
+ bb->size = le16_to_cpu(sb->bblog_size);
+ bb->changed = 0;
+ }
+ }
+
max_dev = 0;
list_for_each_entry(rdev2, &mddev->disks, same_set)
if (rdev2->desc_nr+1 > max_dev)
@@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
num_sectors = max_sectors;
rdev->sb_start = sb_start;
}
- sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+ sb = page_address(rdev->sb_page);
sb->data_size = cpu_to_le64(num_sectors);
sb->super_offset = rdev->sb_start;
sb->sb_csum = calc_sb_1_csum(sb);
@@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
/* May as well allow recovery to be retried once */
- mddev->recovery_disabled = 0;
+ mddev->recovery_disabled++;
return 0;
@@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
rdev->sysfs_state = NULL;
+ kfree(rdev->badblocks.page);
+ rdev->badblocks.count = 0;
+ rdev->badblocks.page = NULL;
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage.
@@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
switch (major_version) {
case 0:
- print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
+ print_sb_90(page_address(rdev->sb_page));
break;
case 1:
- print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
+ print_sb_1(page_address(rdev->sb_page));
break;
}
} else
@@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
mdk_rdev_t *rdev;
int sync_req;
int nospares = 0;
+ int any_badblocks_changed = 0;
repeat:
/* First make sure individual recovery_offsets are correct */
@@ -2208,8 +2341,18 @@ repeat:
if (!mddev->persistent) {
clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
clear_bit(MD_CHANGE_DEVS, &mddev->flags);
- if (!mddev->external)
+ if (!mddev->external) {
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (rdev->badblocks.changed) {
+ md_ack_all_badblocks(&rdev->badblocks);
+ md_error(mddev, rdev);
+ }
+ clear_bit(Blocked, &rdev->flags);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
+ wake_up(&rdev->blocked_wait);
+ }
+ }
wake_up(&mddev->sb_wait);
return;
}
@@ -2265,6 +2408,14 @@ repeat:
MD_BUG();
mddev->events --;
}
+
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (rdev->badblocks.changed)
+ any_badblocks_changed++;
+ if (test_bit(Faulty, &rdev->flags))
+ set_bit(FaultRecorded, &rdev->flags);
+ }
+
sync_sbs(mddev, nospares);
spin_unlock_irq(&mddev->write_lock);
@@ -2290,6 +2441,13 @@ repeat:
bdevname(rdev->bdev,b),
(unsigned long long)rdev->sb_start);
rdev->sb_events = mddev->events;
+ if (rdev->badblocks.size) {
+ md_super_write(mddev, rdev,
+ rdev->badblocks.sector,
+ rdev->badblocks.size << 9,
+ rdev->bb_page);
+ rdev->badblocks.size = 0;
+ }
} else
dprintk(")\n");
@@ -2313,6 +2471,15 @@ repeat:
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (test_and_clear_bit(FaultRecorded, &rdev->flags))
+ clear_bit(Blocked, &rdev->flags);
+
+ if (any_badblocks_changed)
+ md_ack_all_badblocks(&rdev->badblocks);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
+ wake_up(&rdev->blocked_wait);
+ }
}
/* words written to sysfs files may, or may not, be \n terminated.
@@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
char *sep = "";
size_t len = 0;
- if (test_bit(Faulty, &rdev->flags)) {
+ if (test_bit(Faulty, &rdev->flags) ||
+ rdev->badblocks.unacked_exist) {
len+= sprintf(page+len, "%sfaulty",sep);
sep = ",";
}
@@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
}
- if (test_bit(Blocked, &rdev->flags)) {
+ if (test_bit(Blocked, &rdev->flags) ||
+ rdev->badblocks.unacked_exist) {
len += sprintf(page+len, "%sblocked", sep);
sep = ",";
}
@@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page)
len += sprintf(page+len, "%sspare", sep);
sep = ",";
}
+ if (test_bit(WriteErrorSeen, &rdev->flags)) {
+ len += sprintf(page+len, "%swrite_error", sep);
+ sep = ",";
+ }
return len+sprintf(page+len, "\n");
}
@@ -2375,13 +2548,15 @@ static ssize_t
state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
/* can write
- * faulty - simulates and error
+ * faulty - simulates an error
* remove - disconnects the device
* writemostly - sets write_mostly
* -writemostly - clears write_mostly
- * blocked - sets the Blocked flag
- * -blocked - clears the Blocked flag
+ * blocked - sets the Blocked flags
+ * -blocked - clears the Blocked and possibly simulates an error
* insync - sets Insync providing device isn't active
+ * write_error - sets WriteErrorSeen
+ * -write_error - clears WriteErrorSeen
*/
int err = -EINVAL;
if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
set_bit(Blocked, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "-blocked")) {
+ if (!test_bit(Faulty, &rdev->flags) &&
+ test_bit(BlockedBadBlocks, &rdev->flags)) {
+ /* metadata handler doesn't understand badblocks,
+ * so we need to fail the device
+ */
+ md_error(rdev->mddev, rdev);
+ }
clear_bit(Blocked, &rdev->flags);
+ clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
@@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
+ } else if (cmd_match(buf, "write_error")) {
+ set_bit(WriteErrorSeen, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "-write_error")) {
+ clear_bit(WriteErrorSeen, &rdev->flags);
+ err = 0;
}
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
char *e;
int err;
- char nm[20];
int slot = simple_strtoul(buf, &e, 10);
if (strncmp(buf, "none", 4)==0)
slot = -1;
@@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
hot_remove_disk(rdev->mddev, rdev->raid_disk);
if (err)
return err;
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&rdev->mddev->kobj, nm);
+ sysfs_unlink_rdev(rdev->mddev, rdev);
rdev->raid_disk = -1;
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
- sprintf(nm, "rd%d", rdev->raid_disk);
- if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+ if (sysfs_link_rdev(rdev->mddev, rdev))
/* failure here is OK */;
/* don't wakeup anyone, leave that to userspace. */
} else {
@@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le
static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
+
+static ssize_t
+badblocks_show(struct badblocks *bb, char *page, int unack);
+static ssize_t
+badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
+
+static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
+{
+ return badblocks_show(&rdev->badblocks, page, 0);
+}
+static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
+{
+ int rv = badblocks_store(&rdev->badblocks, page, len, 0);
+ /* Maybe that ack was all we needed */
+ if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
+ wake_up(&rdev->blocked_wait);
+ return rv;
+}
+static struct rdev_sysfs_entry rdev_bad_blocks =
+__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
+
+
+static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
+{
+ return badblocks_show(&rdev->badblocks, page, 1);
+}
+static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
+{
+ return badblocks_store(&rdev->badblocks, page, len, 1);
+}
+static struct rdev_sysfs_entry rdev_unack_bad_blocks =
+__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
+
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
&rdev_errors.attr,
@@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = {
&rdev_offset.attr,
&rdev_size.attr,
&rdev_recovery_start.attr,
+ &rdev_bad_blocks.attr,
+ &rdev_unack_bad_blocks.attr,
NULL,
};
static ssize_t
@@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = {
.default_attrs = rdev_default_attrs,
};
-void md_rdev_init(mdk_rdev_t *rdev)
+int md_rdev_init(mdk_rdev_t *rdev)
{
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
@@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev)
rdev->sb_events = 0;
rdev->last_read_error.tv_sec = 0;
rdev->last_read_error.tv_nsec = 0;
+ rdev->sb_loaded = 0;
+ rdev->bb_page = NULL;
atomic_set(&rdev->nr_pending, 0);
atomic_set(&rdev->read_errors, 0);
atomic_set(&rdev->corrected_errors, 0);
INIT_LIST_HEAD(&rdev->same_set);
init_waitqueue_head(&rdev->blocked_wait);
+
+ /* Add space to store bad block list.
+ * This reserves the space even on arrays where it cannot
+ * be used - I wonder if that matters
+ */
+ rdev->badblocks.count = 0;
+ rdev->badblocks.shift = 0;
+ rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ seqlock_init(&rdev->badblocks.lock);
+ if (rdev->badblocks.page == NULL)
+ return -ENOMEM;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
@@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
return ERR_PTR(-ENOMEM);
}
- md_rdev_init(rdev);
- if ((err = alloc_disk_sb(rdev)))
+ err = md_rdev_init(rdev);
+ if (err)
+ goto abort_free;
+ err = alloc_disk_sb(rdev);
+ if (err)
goto abort_free;
err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
goto abort_free;
}
}
+ if (super_format == -1)
+ /* hot-add for 0.90, or non-persistent: so no badblocks */
+ rdev->badblocks.shift = -1;
return rdev;
abort_free:
- if (rdev->sb_page) {
- if (rdev->bdev)
- unlock_rdev(rdev);
- free_disk_sb(rdev);
- }
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ kfree(rdev->badblocks.page);
kfree(rdev);
return ERR_PTR(err);
}
@@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
}
list_for_each_entry(rdev, &mddev->disks, same_set) {
- char nm[20];
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk >= mddev->raid_disks)
rdev->new_raid_disk = -1;
if (rdev->new_raid_disk == rdev->raid_disk)
continue;
- sprintf(nm, "rd%d", rdev->raid_disk);
- sysfs_remove_link(&mddev->kobj, nm);
+ sysfs_unlink_rdev(mddev, rdev);
}
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk < 0)
@@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
if (rdev->raid_disk < 0)
clear_bit(In_sync, &rdev->flags);
else {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
- printk("md: cannot register %s for %s after level change\n",
- nm, mdname(mddev));
+ if (sysfs_link_rdev(mddev, rdev))
+ printk(KERN_WARNING "md: cannot register rd%d"
+ " for %s after level change\n",
+ rdev->raid_disk, mdname(mddev));
}
}
@@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev)
}
if (mddev->bio_set == NULL)
- mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE,
+ sizeof(mddev_t *));
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
@@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev)
smp_wmb();
mddev->ready = 1;
list_for_each_entry(rdev, &mddev->disks, same_set)
- if (rdev->raid_disk >= 0) {
- char nm[20];
- sprintf(nm, "rd%d", rdev->raid_disk);
- if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
+ if (rdev->raid_disk >= 0)
+ if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
- }
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open