Merge tag 'md/4.1' of git://neil.brown.name/md

Pull md updates from Neil Brown: "More updates that usual this time. A few have performance impacts which hould mostly be positive, but RAID5 (in particular) can be very work-load ensitive... We'll have to wait and see. Highlights: - "experimental" code for managing md/raid1 across a cluster using DLM. Code is not ready for general use and triggers a WARNING if used. However it is looking good and mostly done and having in mainline will help co-ordinate development. - RAID5/6 can now batch multiple (4K wide) stripe_heads so as to handle a full (chunk wide) stripe as a single unit. - RAID6 can now perform read-modify-write cycles which should help performance on larger arrays: 6 or more devices. - RAID5/6 stripe cache now grows and shrinks dynamically. The value set is used as a minimum. - Resync is now allowed to go a little faster than the 'mininum' when there is competing IO. How much faster depends on the speed of the devices, so the effective minimum should scale with device speed to some extent" * tag 'md/4.1' of git://neil.brown.name/md: (58 commits) md/raid5: don't do chunk aligned read on degraded array. md/raid5: allow the stripe_cache to grow and shrink. md/raid5: change ->inactive_blocked to a bit-flag. md/raid5: move max_nr_stripes management into grow_one_stripe and drop_one_stripe md/raid5: pass gfp_t arg to grow_one_stripe() md/raid5: introduce configuration option rmw_level md/raid5: activate raid6 rmw feature md/raid6 algorithms: xor_syndrome() for SSE2 md/raid6 algorithms: xor_syndrome() for generic int md/raid6 algorithms: improve test program md/raid6 algorithms: delta syndrome functions raid5: handle expansion/resync case with stripe batching raid5: handle io error of batch list RAID5: batch adjacent full stripe write raid5: track overwrite disk count raid5: add a new flag to track if a stripe can be batched raid5: use flex_array for scribble data md raid0: access mddev->queue (request queue member) conditionally because it is not set when accessed from dm-raid md: allow resync to go faster when there is competing IO. md: remove 'go_faster' option from ->sync_request() ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-04-24 09:28:01 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-04-24 09:28:01 -0700
commit: 474095e46cd14421821da3201a9fd6a4c070996b (patch)
tree: 7203d36f53c376a96099ed0310787b1fb0c4f7a5 /drivers/md
parent: d56a669ca59c37ed0a7282a251b2f2f22533343a (diff)
parent: 9ffc8f7cb9647b13dfe4d1ad0d5e1427bb8b46d6 (diff)
13 files changed, 2309 insertions, 279 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 6ddc983417d5..edcf4ab66e00 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -175,6 +175,22 @@ config MD_FAULTY
 
 	  In unsure, say N.
 
+
+config MD_CLUSTER
+	tristate "Cluster Support for MD (EXPERIMENTAL)"
+	depends on BLK_DEV_MD
+	depends on DLM
+	default n
+	---help---
+	Clustering support for MD devices. This enables locking and
+	synchronization across multiple systems on the cluster, so all
+	nodes in the cluster can access the MD devices simultaneously.
+
+	This brings the redundancy (and uptime) of RAID levels across the
+	nodes of the cluster.
+
+	If unsure, say N.
+
 source "drivers/md/bcache/Kconfig"
 
 config BLK_DEV_DM_BUILTIN
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 1863feaa5846..dba4db5985fb 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10)		+= raid10.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
+obj-$(CONFIG_MD_CLUSTER)	+= md-cluster.o
 obj-$(CONFIG_BCACHE)		+= bcache/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3a5767968ba0..2bc56e2a3526 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
+	int node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* This might have been changed by a reshape */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
+	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
 	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
 					   bitmap_info.space);
 	kunmap_atomic(sb);
@@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
+	int nodes = 0;
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
@@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		return -ENOMEM;
 	bitmap->storage.sb_page = sb_page;
 
+re_read:
+	/* If cluster_slot is set, the cluster is setup */
+	if (bitmap->cluster_slot >= 0) {
+		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
+
+		sector_div(bm_blocks,
+			   bitmap->mddev->bitmap_info.chunksize >> 9);
+		/* bits to bytes */
+		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
+		/* to 4k blocks */
+		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
+		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+			bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+	}
+
 	if (bitmap->storage.file) {
 		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
@@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (err)
 		return err;
 
+	err = -EINVAL;
 	sb = kmap_atomic(sb_page);
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
+	nodes = le32_to_cpu(sb->nodes);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 			goto out;
 		}
 		events = le64_to_cpu(sb->events);
-		if (events < bitmap->mddev->events) {
+		if (!nodes && (events < bitmap->mddev->events)) {
 			printk(KERN_INFO
 			       "%s: bitmap file is out of date (%llu < %llu) "
 			       "-- forcing full recovery\n",
@@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 	err = 0;
+
 out:
 	kunmap_atomic(sb);
+	/* Assiging chunksize is required for "re_read" */
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	if (nodes && (bitmap->cluster_slot < 0)) {
+		err = md_setup_cluster(bitmap->mddev, nodes);
+		if (err) {
+			pr_err("%s: Could not setup cluster service (%d)\n",
+					bmname(bitmap), err);
+			goto out_no_sb;
+		}
+		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+		goto re_read;
+	}
+
+
 out_no_sb:
 	if (test_bit(BITMAP_STALE, &bitmap->flags))
 		bitmap->events_cleared = bitmap->mddev->events;
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
 	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+	bitmap->mddev->bitmap_info.nodes = nodes;
 	if (bitmap->mddev->bitmap_info.space == 0 ||
 	    bitmap->mddev->bitmap_info.space > sectors_reserved)
 		bitmap->mddev->bitmap_info.space = sectors_reserved;
-	if (err)
+	if (err) {
 		bitmap_print_sb(bitmap);
+		if (bitmap->cluster_slot < 0)
+			md_cluster_stop(bitmap->mddev);
+	}
 	return err;
 }
 
@@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store,
 }
 
 static int bitmap_storage_alloc(struct bitmap_storage *store,
-				unsigned long chunks, int with_super)
+				unsigned long chunks, int with_super,
+				int slot_number)
 {
-	int pnum;
+	int pnum, offset = 0;
 	unsigned long num_pages;
 	unsigned long bytes;
 
@@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		bytes += sizeof(bitmap_super_t);
 
 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+	offset = slot_number * (num_pages - 1);
 
 	store->filemap = kmalloc(sizeof(struct page *)
 				 * num_pages, GFP_KERNEL);
@@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (store->sb_page == NULL)
 			return -ENOMEM;
-		store->sb_page->index = 0;
 	}
+
 	pnum = 0;
 	if (store->sb_page) {
 		store->filemap[0] = store->sb_page;
 		pnum = 1;
+		store->sb_page->index = offset;
 	}
+
 	for ( ; pnum < num_pages; pnum++) {
 		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (!store->filemap[pnum]) {
 			store->file_pages = pnum;
 			return -ENOMEM;
 		}
-		store->filemap[pnum]->index = pnum;
+		store->filemap[pnum]->index = pnum + offset;
 	}
 	store->file_pages = pnum;
 
@@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	}
 }
 
+static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
+{
+	unsigned long bit;
+	struct page *page;
+	void *paddr;
+	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	int set = 0;
+
+	page = filemap_get_page(&bitmap->storage, chunk);
+	if (!page)
+		return -EINVAL;
+	bit = file_page_offset(&bitmap->storage, chunk);
+	paddr = kmap_atomic(page);
+	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+		set = test_bit(bit, paddr);
+	else
+		set = test_bit_le(bit, paddr);
+	kunmap_atomic(paddr);
+	return set;
+}
+
+
 /* this gets called when the md device is ready to unplug its underlying
  * (slave) device queues -- before we let any writes go down, we need to
  * sync the dirty pages of the bitmap file to disk */
@@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
  */
 static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
-	unsigned long i, chunks, index, oldindex, bit;
+	unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
 	struct page *page = NULL;
 	unsigned long bit_cnt = 0;
 	struct file *file;
@@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	if (!bitmap->mddev->bitmap_info.external)
 		offset = sizeof(bitmap_super_t);
 
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
+
 	for (i = 0; i < chunks; i++) {
 		int b;
 		index = file_page_index(&bitmap->storage, i);
@@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 					bitmap->mddev,
 					bitmap->mddev->bitmap_info.offset,
 					page,
-					index, count);
+					index + node_offset, count);
 
 			if (ret)
 				goto err;
@@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 	     j < bitmap->storage.file_pages
 		     && !test_bit(BITMAP_STALE, &bitmap->flags);
 	     j++) {
-
 		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_DIRTY))
 			/* bitmap_unplug will handle the rest */
@@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 		return;
 	}
 	if (!*bmc) {
-		*bmc = 2 | (needed ? NEEDED_MASK : 0);
+		*bmc = 2;
 		bitmap_count_page(&bitmap->counts, offset, 1);
 		bitmap_set_pending(&bitmap->counts, offset);
 		bitmap->allclean = 0;
 	}
+	if (needed)
+		*bmc |= NEEDED_MASK;
 	spin_unlock_irq(&bitmap->counts.lock);
 }
 
@@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
+	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
+		bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
+		md_cluster_stop(bitmap->mddev);
+
 	/* Shouldn't be needed - but just in case.... */
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes) == 0);
@@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev)
  * initialize the bitmap structure
  * if this returns an error, bitmap_destroy must be called to do clean up
  */
-int bitmap_create(struct mddev *mddev)
+struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 {
 	struct bitmap *bitmap;
 	sector_t blocks = mddev->resync_max_sectors;
@@ -1650,7 +1729,7 @@ int bitmap_create(struct mddev *mddev)
 
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&bitmap->counts.lock);
 	atomic_set(&bitmap->pending_writes, 0);
@@ -1659,6 +1738,7 @@ int bitmap_create(struct mddev *mddev)
 	init_waitqueue_head(&bitmap->behind_wait);
 
 	bitmap->mddev = mddev;
+	bitmap->cluster_slot = slot;
 
 	if (mddev->kobj.sd)
 		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
@@ -1706,12 +1786,14 @@ int bitmap_create(struct mddev *mddev)
 	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
 	       bitmap->counts.pages, bmname(bitmap));
 
-	mddev->bitmap = bitmap;
-	return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+	if (err)
+		goto error;
 
+	return bitmap;
  error:
 	bitmap_free(bitmap);
-	return err;
+	return ERR_PTR(err);
 }
 
 int bitmap_load(struct mddev *mddev)
@@ -1765,6 +1847,60 @@ out:
 }
 EXPORT_SYMBOL_GPL(bitmap_load);
 
+/* Loads the bitmap associated with slot and copies the resync information
+ * to our bitmap
+ */
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+		sector_t *low, sector_t *high, bool clear_bits)
+{
+	int rv = 0, i, j;
+	sector_t block, lo = 0, hi = 0;
+	struct bitmap_counts *counts;
+	struct bitmap *bitmap = bitmap_create(mddev, slot);
+
+	if (IS_ERR(bitmap))
+		return PTR_ERR(bitmap);
+
+	rv = bitmap_read_sb(bitmap);
+	if (rv)
+		goto err;
+
+	rv = bitmap_init_from_disk(bitmap, 0);
+	if (rv)
+		goto err;
+
+	counts = &bitmap->counts;
+	for (j = 0; j < counts->chunks; j++) {
+		block = (sector_t)j << counts->chunkshift;
+		if (bitmap_file_test_bit(bitmap, block)) {
+			if (!lo)
+				lo = block;
+			hi = block;
+			bitmap_file_clear_bit(bitmap, block);
+			bitmap_set_memory_bits(mddev->bitmap, block, 1);
+			bitmap_file_set_bit(mddev->bitmap, block);
+		}
+	}
+
+	if (clear_bits) {
+		bitmap_update_sb(bitmap);
+		/* Setting this for the ev_page should be enough.
+		 * And we do not require both write_all and PAGE_DIRT either
+		 */
+		for (i = 0; i < bitmap->storage.file_pages; i++)
+			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+		bitmap_write_all(bitmap);
+		bitmap_unplug(bitmap);
+	}
+	*low = lo;
+	*high = hi;
+err:
+	bitmap_free(bitmap);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
+
+
 void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 {
 	unsigned long chunk_kb;
@@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 	memset(&store, 0, sizeof(store));
 	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
 		ret = bitmap_storage_alloc(&store, chunks,
-					   !bitmap->mddev->bitmap_info.external);
+					   !bitmap->mddev->bitmap_info.external,
+					   bitmap->cluster_slot);
 	if (ret)
 		goto err;
 
@@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 				return -EINVAL;
 			mddev->bitmap_info.offset = offset;
 			if (mddev->pers) {
+				struct bitmap *bitmap;
 				mddev->pers->quiesce(mddev, 1);
-				rv = bitmap_create(mddev);
-				if (!rv)
+				bitmap = bitmap_create(mddev, -1);
+				if (IS_ERR(bitmap))
+					rv = PTR_ERR(bitmap);
+				else {
+					mddev->bitmap = bitmap;
 					rv = bitmap_load(mddev);
-				if (rv) {
-					bitmap_destroy(mddev);
-					mddev->bitmap_info.offset = 0;
+					if (rv) {
+						bitmap_destroy(mddev);
+						mddev->bitmap_info.offset = 0;
+					}
 				}
 				mddev->pers->quiesce(mddev, 0);
 				if (rv)
@@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
 
 static ssize_t metadata_show(struct mddev *mddev, char *page)
 {
+	if (mddev_is_clustered(mddev))
+		return sprintf(page, "clustered\n");
 	return sprintf(page, "%s\n", (mddev->bitmap_info.external
 				      ? "external" : "internal"));
 }
@@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EBUSY;
 	if (strncmp(buf, "external", 8) == 0)
 		mddev->bitmap_info.external = 1;
-	else if (strncmp(buf, "internal", 8) == 0)
+	else if ((strncmp(buf, "internal", 8) == 0) ||
+			(strncmp(buf, "clustered", 9) == 0))
 		mddev->bitmap_info.external = 0;
 	else
 		return -EINVAL;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 30210b9c4ef9..f1f4dd01090d 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -130,8 +130,9 @@ typedef struct bitmap_super_s {
 	__le32 write_behind; /* 60  number of outstanding write-behind writes */
 	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
 				  * reserved for the bitmap. */
-
-	__u8  pad[256 - 68]; /* set to zero */
+	__le32 nodes;        /* 68 the maximum number of nodes in cluster. */
+	__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+	__u8  pad[256 - 136]; /* set to zero */
 } bitmap_super_t;
 
 /* notes:
@@ -226,12 +227,13 @@ struct bitmap {
 	wait_queue_head_t behind_wait;
 
 	struct kernfs_node *sysfs_can_clear;
+	int cluster_slot;		/* Slot offset for clustered env */
 };
 
 /* the bitmap API */
 
 /* these are used only by md/bitmap */
-int  bitmap_create(struct mddev *mddev);
+struct bitmap *bitmap_create(struct mddev *mddev, int slot);
 int bitmap_load(struct mddev *mddev);
 void bitmap_flush(struct mddev *mddev);
 void bitmap_destroy(struct mddev *mddev);
@@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev);
 
 int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		  int chunksize, int init);
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+				sector_t *lo, sector_t *hi, bool clear_bits);
 #endif
 
 #endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
new file mode 100644
index 000000000000..fcfc4b9b2672
--- /dev/null
+++ b/drivers/md/md-cluster.c
@@ -0,0 +1,965 @@
+/*
+ * Copyright (C) 2015, SUSE
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/dlm.h>
+#include <linux/sched.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+#include "bitmap.h"
+#include "md-cluster.h"
+
+#define LVB_SIZE	64
+#define NEW_DEV_TIMEOUT 5000
+
+struct dlm_lock_resource {
+	dlm_lockspace_t *ls;
+	struct dlm_lksb lksb;
+	char *name; /* lock name. */
+	uint32_t flags; /* flags to pass to dlm_lock() */
+	struct completion completion; /* completion for synchronized locking */
+	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
+	struct mddev *mddev; /* pointing back to mddev. */
+};
+
+struct suspend_info {
+	int slot;
+	sector_t lo;
+	sector_t hi;
+	struct list_head list;
+};
+
+struct resync_info {
+	__le64 lo;
+	__le64 hi;
+};
+
+/* md_cluster_info flags */
+#define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
+
+
+struct md_cluster_info {
+	/* dlm lock space and resources for clustered raid. */
+	dlm_lockspace_t *lockspace;
+	int slot_number;
+	struct completion completion;
+	struct dlm_lock_resource *sb_lock;
+	struct mutex sb_mutex;
+	struct dlm_lock_resource *bitmap_lockres;
+	struct list_head suspend_list;
+	spinlock_t suspend_lock;
+	struct md_thread *recovery_thread;
+	unsigned long recovery_map;
+	/* communication loc resources */
+	struct dlm_lock_resource *ack_lockres;
+	struct dlm_lock_resource *message_lockres;
+	struct dlm_lock_resource *token_lockres;
+	struct dlm_lock_resource *no_new_dev_lockres;
+	struct md_thread *recv_thread;
+	struct completion newdisk_completion;
+	unsigned long state;
+};
+
+enum msg_type {
+	METADATA_UPDATED = 0,
+	RESYNCING,
+	NEWDISK,
+	REMOVE,
+	RE_ADD,
+};
+
+struct cluster_msg {
+	int type;
+	int slot;
+	/* TODO: Unionize this for smaller footprint */
+	sector_t low;
+	sector_t high;
+	char uuid[16];
+	int raid_slot;
+};
+
+static void sync_ast(void *arg)
+{
+	struct dlm_lock_resource *res;
+
+	res = (struct dlm_lock_resource *) arg;
+	complete(&res->completion);
+}
+
+static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
+{
+	int ret = 0;
+
+	init_completion(&res->completion);
+	ret = dlm_lock(res->ls, mode, &res->lksb,
+			res->flags, res->name, strlen(res->name),
+			0, sync_ast, res, res->bast);
+	if (ret)
+		return ret;
+	wait_for_completion(&res->completion);
+	return res->lksb.sb_status;
+}
+
+static int dlm_unlock_sync(struct dlm_lock_resource *res)
+{
+	return dlm_lock_sync(res, DLM_LOCK_NL);
+}
+
+static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
+		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
+{
+	struct dlm_lock_resource *res = NULL;
+	int ret, namelen;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+	if (!res)
+		return NULL;
+	res->ls = cinfo->lockspace;
+	res->mddev = mddev;
+	namelen = strlen(name);
+	res->name = kzalloc(namelen + 1, GFP_KERNEL);
+	if (!res->name) {
+		pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
+		goto out_err;
+	}
+	strlcpy(res->name, name, namelen + 1);
+	if (with_lvb) {
+		res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
+		if (!res->lksb.sb_lvbptr) {
+			pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
+			goto out_err;
+		}
+		res->flags = DLM_LKF_VALBLK;
+	}
+
+	if (bastfn)
+		res->bast = bastfn;
+
+	res->flags |= DLM_LKF_EXPEDITE;
+
+	ret = dlm_lock_sync(res, DLM_LOCK_NL);
+	if (ret) {
+		pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
+		goto out_err;
+	}
+	res->flags &= ~DLM_LKF_EXPEDITE;
+	res->flags |= DLM_LKF_CONVERT;
+
+	return res;
+out_err:
+	kfree(res->lksb.sb_lvbptr);
+	kfree(res->name);
+	kfree(res);
+	return NULL;
+}
+
+static void lockres_free(struct dlm_lock_resource *res)
+{
+	if (!res)
+		return;
+
+	init_completion(&res->completion);
+	dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+	wait_for_completion(&res->completion);
+
+	kfree(res->name);
+	kfree(res->lksb.sb_lvbptr);
+	kfree(res);
+}
+
+static char *pretty_uuid(char *dest, char *src)
+{
+	int i, len = 0;
+
+	for (i = 0; i < 16; i++) {
+		if (i == 4 || i == 6 || i == 8 || i == 10)
+			len += sprintf(dest + len, "-");
+		len += sprintf(dest + len, "%02x", (__u8)src[i]);
+	}
+	return dest;
+}
+
+static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
+		sector_t lo, sector_t hi)
+{
+	struct resync_info *ri;
+
+	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
+	ri->lo = cpu_to_le64(lo);
+	ri->hi = cpu_to_le64(hi);
+}
+
+static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
+{
+	struct resync_info ri;
+	struct suspend_info *s = NULL;
+	sector_t hi = 0;
+
+	dlm_lock_sync(lockres, DLM_LOCK_CR);
+	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
+	hi = le64_to_cpu(ri.hi);
+	if (ri.hi > 0) {
+		s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
+		if (!s)
+			goto out;
+		s->hi = hi;
+		s->lo = le64_to_cpu(ri.lo);
+	}
+	dlm_unlock_sync(lockres);
+out:
+	return s;
+}
+
+static void recover_bitmaps(struct md_thread *thread)
+{
+	struct mddev *mddev = thread->mddev;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct dlm_lock_resource *bm_lockres;
+	char str[64];
+	int slot, ret;
+	struct suspend_info *s, *tmp;
+	sector_t lo, hi;
+
+	while (cinfo->recovery_map) {
+		slot = fls64((u64)cinfo->recovery_map) - 1;
+
+		/* Clear suspend_area associated with the bitmap */
+		spin_lock_irq(&cinfo->suspend_lock);
+		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
+			if (slot == s->slot) {
+				list_del(&s->list);
+				kfree(s);
+			}
+		spin_unlock_irq(&cinfo->suspend_lock);
+
+		snprintf(str, 64, "bitmap%04d", slot);
+		bm_lockres = lockres_init(mddev, str, NULL, 1);
+		if (!bm_lockres) {
+			pr_err("md-cluster: Cannot initialize bitmaps\n");
+			goto clear_bit;
+		}
+
+		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+		if (ret) {
+			pr_err("md-cluster: Could not DLM lock %s: %d\n",
+					str, ret);
+			goto clear_bit;
+		}
+		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
+		if (ret) {
+			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
+			goto dlm_unlock;
+		}
+		if (hi > 0) {
+			/* TODO:Wait for current resync to get over */
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			if (lo < mddev->recovery_cp)
+				mddev->recovery_cp = lo;
+			md_check_recovery(mddev);
+		}
+dlm_unlock:
+		dlm_unlock_sync(bm_lockres);
+clear_bit:
+		clear_bit(slot, &cinfo->recovery_map);
+	}
+}
+
+static void recover_prep(void *arg)
+{
+}
+
+static void recover_slot(void *arg, struct dlm_slot *slot)
+{
+	struct mddev *mddev = arg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
+			mddev->bitmap_info.cluster_name,
+			slot->nodeid, slot->slot,
+			cinfo->slot_number);
+	set_bit(slot->slot - 1, &cinfo->recovery_map);
+	if (!cinfo->recovery_thread) {
+		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
+				mddev, "recover");
+		if (!cinfo->recovery_thread) {
+			pr_warn("md-cluster: Could not create recovery thread\n");
+			return;
+		}
+	}
+	md_wakeup_thread(cinfo->recovery_thread);
+}
+
+static void recover_done(void *arg, struct dlm_slot *slots,
+		int num_slots, int our_slot,
+		uint32_t generation)
+{
+	struct mddev *mddev = arg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	cinfo->slot_number = our_slot;
+	complete(&cinfo->completion);
+}
+
+static const struct dlm_lockspace_ops md_ls_ops = {
+	.recover_prep = recover_prep,
+	.recover_slot = recover_slot,
+	.recover_done = recover_done,
+};
+
+/*
+ * The BAST function for the ack lock resource
+ * This function wakes up the receive thread in
+ * order to receive and process the message.
+ */
+static void ack_bast(void *arg, int mode)
+{
+	struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
+	struct md_cluster_info *cinfo = res->mddev->cluster_info;
+
+	if (mode == DLM_LOCK_EX)
+		md_wakeup_thread(cinfo->recv_thread);
+}
+
+static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+{
+	struct suspend_info *s, *tmp;
+
+	list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
+		if (slot == s->slot) {
+			pr_info("%s:%d Deleting suspend_info: %d\n",
+					__func__, __LINE__, slot);
+			list_del(&s->list);
+			kfree(s);
+			break;
+		}
+}
+
+static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+{
+	spin_lock_irq(&cinfo->suspend_lock);
+	__remove_suspend_info(cinfo, slot);
+	spin_unlock_irq(&cinfo->suspend_lock);
+}
+
+
+static void process_suspend_info(struct md_cluster_info *cinfo,
+		int slot, sector_t lo, sector_t hi)
+{
+	struct suspend_info *s;
+
+	if (!hi) {
+		remove_suspend_info(cinfo, slot);
+		return;
+	}
+	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
+	if (!s)
+		return;
+	s->slot = slot;
+	s->lo = lo;
+	s->hi = hi;
+	spin_lock_irq(&cinfo->suspend_lock);
+	/* Remove existing entry (if exists) before adding */
+	__remove_suspend_info(cinfo, slot);
+	list_add(&s->list, &cinfo->suspend_list);
+	spin_unlock_irq(&cinfo->suspend_lock);
+}
+
+static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
+{
+	char disk_uuid[64];
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	char event_name[] = "EVENT=ADD_DEVICE";
+	char raid_slot[16];
+	char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
+	int len;
+
+	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
+	pretty_uuid(disk_uuid + len, cmsg->uuid);
+	snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
+	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
+	init_completion(&cinfo->newdisk_completion);
+	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
+	wait_for_completion_timeout(&cinfo->newdisk_completion,
+			NEW_DEV_TIMEOUT);
+	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
+}
+
+
+static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	md_reload_sb(mddev);
+	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+}
+
+static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
+{
+	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+
+	if (rdev)
+		md_kick_rdev_from_array(rdev);
+	else
+		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
+}
+
+static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
+{
+	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+
+	if (rdev && test_bit(Faulty, &rdev->flags))
+		clear_bit(Faulty, &rdev->flags);
+	else
+		pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
+}
+
+static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
+{
+	switch (msg->type) {
+	case METADATA_UPDATED:
+		pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
+			__func__, __LINE__, msg->slot);
+		process_metadata_update(mddev, msg);
+		break;
+	case RESYNCING:
+		pr_info("%s: %d Received message: RESYNCING from %d\n",
+			__func__, __LINE__, msg->slot);
+		process_suspend_info(mddev->cluster_info, msg->slot,
+				msg->low, msg->high);
+		break;
+	case NEWDISK:
+		pr_info("%s: %d Received message: NEWDISK from %d\n",
+			__func__, __LINE__, msg->slot);
+		process_add_new_disk(mddev, msg);
+		break;
+	case REMOVE:
+		pr_info("%s: %d Received REMOVE from %d\n",
+			__func__, __LINE__, msg->slot);
+		process_remove_disk(mddev, msg);
+		break;
+	case RE_ADD:
+		pr_info("%s: %d Received RE_ADD from %d\n",
+			__func__, __LINE__, msg->slot);
+		process_readd_disk(mddev, msg);
+		break;
+	default:
+		pr_warn("%s:%d Received unknown message from %d\n",
+			__func__, __LINE__, msg->slot);
+	}
+}
+
+/*
+ * thread for receiving message
+ */
+static void recv_daemon(struct md_thread *thread)
+{
+	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
+	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
+	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
+	struct cluster_msg msg;
+
+	/*get CR on Message*/
+	if (dlm_l
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-04-24 09:28:01 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-04-24 09:28:01 -0700
commit	474095e46cd14421821da3201a9fd6a4c070996b (patch)
tree	7203d36f53c376a96099ed0310787b1fb0c4f7a5 /drivers/md
parent	d56a669ca59c37ed0a7282a251b2f2f22533343a (diff)
parent	9ffc8f7cb9647b13dfe4d1ad0d5e1427bb8b46d6 (diff)