1 files changed, 2132 insertions, 0 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
new file mode 100644
index 000000000000..0c8eb1093f00
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -0,0 +1,2132 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmrecovery.c
+ *
+ * recovery stuff
+ *
+ * Copyright (C) 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
+#include "cluster/masklog.h"
+
+static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
+
+static int dlm_recovery_thread(void *data);
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
+static int dlm_do_recovery(struct dlm_ctxt *dlm);
+
+static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+static int dlm_request_all_locks(struct dlm_ctxt *dlm,
+				 u8 request_from, u8 dead_node);
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master);
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				    struct dlm_migratable_lockres *mres,
+				    u8 send_to,
+				    struct dlm_lock_resource *res,
+				    int total_locks);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master);
+static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
+				     struct dlm_lock_resource *res,
+				     struct dlm_migratable_lockres *mres);
+static int dlm_do_master_requery(struct dlm_ctxt *dlm,
+				 struct dlm_lock_resource *res,
+				 u8 nodenum, u8 *real_master);
+static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
+				 u8 dead_node, u8 send_to);
+static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list, u8 dead_node);
+static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
+					      u8 dead_node, u8 new_master);
+static void dlm_reco_ast(void *astdata);
+static void dlm_reco_bast(void *astdata, int blocked_type);
+static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
+static void dlm_request_all_locks_worker(struct dlm_work_item *item,
+					 void *data);
+static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+
+static u64 dlm_get_next_mig_cookie(void);
+
+static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static u64 dlm_mig_cookie = 1;
+
+static u64 dlm_get_next_mig_cookie(void)
+{
+	u64 c;
+	spin_lock(&dlm_mig_cookie_lock);
+	c = dlm_mig_cookie;
+	if (dlm_mig_cookie == (~0ULL))
+		dlm_mig_cookie = 1;
+	else
+		dlm_mig_cookie++;
+	spin_unlock(&dlm_mig_cookie_lock);
+	return c;
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	spin_unlock(&dlm->spinlock);
+}
+
+/* Worker function used during recovery. */
+void dlm_dispatch_work(void *data)
+{
+	struct dlm_ctxt *dlm = (struct dlm_ctxt *)data;
+	LIST_HEAD(tmp_list);
+	struct list_head *iter, *iter2;
+	struct dlm_work_item *item;
+	dlm_workfunc_t *workfunc;
+
+	spin_lock(&dlm->work_lock);
+	list_splice_init(&dlm->work_list, &tmp_list);
+	spin_unlock(&dlm->work_lock);
+
+	list_for_each_safe(iter, iter2, &tmp_list) {
+		item = list_entry(iter, struct dlm_work_item, list);
+		workfunc = item->func;
+		list_del_init(&item->list);
+
+		/* already have ref on dlm to avoid having
+		 * it disappear.  just double-check. */
+		BUG_ON(item->dlm != dlm);
+
+		/* this is allowed to sleep and
+		 * call network stuff */
+		workfunc(item, item->data);
+
+		dlm_put(dlm);
+		kfree(item);
+	}
+}
+
+/*
+ * RECOVERY THREAD
+ */
+
+static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
+{
+	/* wake the recovery thread
+	 * this will wake the reco thread in one of three places
+	 * 1) sleeping with no recovery happening
+	 * 2) sleeping with recovery mastered elsewhere
+	 * 3) recovery mastered here, waiting on reco data */
+
+	wake_up(&dlm->dlm_reco_thread_wq);
+}
+
+/* Launch the recovery thread */
+int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
+{
+	mlog(0, "starting dlm recovery thread...\n");
+
+	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
+						"dlm_reco_thread");
+	if (IS_ERR(dlm->dlm_reco_thread_task)) {
+		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
+		dlm->dlm_reco_thread_task = NULL;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_reco_thread_task) {
+		mlog(0, "waiting for dlm recovery thread to exit\n");
+		kthread_stop(dlm->dlm_reco_thread_task);
+		dlm->dlm_reco_thread_task = NULL;
+	}
+}
+
+
+
+/*
+ * this is lame, but here's how recovery works...
+ * 1) all recovery threads cluster wide will work on recovering
+ *    ONE node at a time
+ * 2) negotiate who will take over all the locks for the dead node.
+ *    thats right... ALL the locks.
+ * 3) once a new master is chosen, everyone scans all locks
+ *    and moves aside those mastered by the dead guy
+ * 4) each of these locks should be locked until recovery is done
+ * 5) the new master collects up all of secondary lock queue info
+ *    one lock at a time, forcing each node to communicate back
+ *    before continuing
+ * 6) each secondary lock queue responds with the full known lock info
+ * 7) once the new master has run all its locks, it sends a ALLDONE!
+ *    message to everyone
+ * 8) upon receiving this message, the secondary queue node unlocks
+ *    and responds to the ALLDONE
+ * 9) once the new master gets responses from everyone, he unlocks
+ *    everything and recovery for this dead node is done
+ *10) go back to 2) while there are still dead nodes
+ *
+ */
+
+
+#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
+
+static int dlm_recovery_thread(void *data)
+{
+	int status;
+	struct dlm_ctxt *dlm = data;
+	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
+
+	mlog(0, "dlm thread running for %s...\n", dlm->name);
+
+	while (!kthread_should_stop()) {
+		if (dlm_joined(dlm)) {
+			status = dlm_do_recovery(dlm);
+			if (status == -EAGAIN) {
+				/* do not sleep, recheck immediately. */
+				continue;
+			}
+			if (status < 0)
+				mlog_errno(status);
+		}
+
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+						 kthread_should_stop(),
+						 timeout);
+	}
+
+	mlog(0, "quitting DLM recovery thread\n");
+	return 0;
+}
+
+/* callers of the top-level api calls (dlmlock/dlmunlock) should
+ * block on the dlm->reco.event when recovery is in progress.
+ * the dlm recovery thread will set this state when it begins
+ * recovering a dead node (as the new master or not) and clear
+ * the state and wake as soon as all affected lock resources have
+ * been marked with the RECOVERY flag */
+static int dlm_in_recovery(struct dlm_ctxt *dlm)
+{
+	int in_recovery;
+	spin_lock(&dlm->spinlock);
+	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+	spin_unlock(&dlm->spinlock);
+	return in_recovery;
+}
+
+
+void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
+{
+	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
+}
+
+static void dlm_begin_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+}
+
+static void dlm_end_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
+	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
+	spin_unlock(&dlm->spinlock);
+	wake_up(&dlm->reco.event);
+}
+
+static int dlm_do_recovery(struct dlm_ctxt *dlm)
+{
+	int status = 0;
+
+	spin_lock(&dlm->spinlock);
+
+	/* check to see if the new master has died */
+	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
+	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+		mlog(0, "new master %u died while recovering %u!\n",
+		     dlm->reco.new_master, dlm->reco.dead_node);
+		/* unset the new_master, leave dead_node */
+		dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	}
+
+	/* select a target to recover */
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		int bit;
+
+		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+		if (bit >= O2NM_MAX_NODES || bit < 0)
+			dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+		else
+			dlm->reco.dead_node = bit;
+	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
+		/* BUG? */
+		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
+		     dlm->reco.dead_node);
+		dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+	}
+
+	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+		// mlog(0, "nothing to recover!  sleeping now!\n");
+		spin_unlock(&dlm->spinlock);
+		/* return to main thread loop and sleep. */
+		return 0;
+	}
+	mlog(0, "recovery thread found node %u in the recovery map!\n",
+	     dlm->reco.dead_node);
+	spin_unlock(&dlm->spinlock);
+
+	/* take write barrier */
+	/* (stops the list reshuffling thread, proxy ast handling) */
+	dlm_begin_recovery(dlm);
+
+	if (dlm->reco.new_master == dlm->node_num)
+		goto master_here;
+
+	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+		/* choose a new master */
+		if (!dlm_pick_recovery_master(dlm)) {
+			/* already notified everyone.  go. */
+			dlm->reco.new_master = dlm->node_num;
+			goto master_here;
+		}
+		mlog(0, "another node will master this recovery session.\n");
+	}
+	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
+	     dlm->name, dlm->reco.new_master,
+	     dlm->node_num, dlm->reco.dead_node);
+
+	/* it is safe to start everything back up here
+	 * because all of the dead node's lock resources
+	 * have been marked as in-recovery */
+	dlm_end_recovery(dlm);
+
+	/* sleep out in main dlm_recovery_thread loop. */
+	return 0;
+
+master_here:
+	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+	     dlm->name, dlm->reco.dead_node, dlm->node_num);
+
+	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
+	if (status < 0) {
+		mlog(ML_ERROR, "error %d remastering locks for node %u, "
+		     "retrying.\n", status, dlm->reco.dead_node);
+	} else {
+		/* success!  see if any other nodes need recovery */
+		dlm_reset_recovery(dlm);
+	}
+	dlm_end_recovery(dlm);
+
+	/* continue and look for another dead node */
+	return -EAGAIN;
+}
+
+static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int status = 0;
+	struct dlm_reco_node_data *ndata;
+	struct list_head *iter;
+	int all_nodes_done;
+	int destroy = 0;
+	int pass = 0;
+
+	status = dlm_init_recovery_area(dlm, dead_node);
+	if (status < 0)
+		goto leave;
+
+	/* safe to access the node data list without a lock, since this
+	 * process is the only one to change the list */
+	list_for_each(iter, &dlm->reco.node_data) {
+		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
+		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
+
+		mlog(0, "requesting lock info from node %u\n",
+		     ndata->node_num);
+
+		if (ndata->node_num == dlm->node_num) {
+			ndata->state = DLM_RECO_NODE_DATA_DONE;
+			continue;
+		}
+
+		status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
+		if (status < 0) {
+			mlog_errno(status);
+			if (dlm_is_host_down(status))
+				ndata->state = DLM_RECO_NODE_DATA_DEAD;
+			else {
+				destroy = 1;
+				goto leave;
+			}
+		}
+
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_DEAD:
+				mlog(0, "node %u died after requesting "
+				     "recovery info for node %u\n",
+				     ndata->node_num, dead_node);
+				// start all over
+				destroy = 1;
+				status = -EAGAIN;
+				goto leave;
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
+				mlog(0, "now receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+				mlog(0, "already receiving recovery data from "
+				     "node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+			case DLM_RECO_NODE_DATA_DONE:
+				mlog(0, "already DONE receiving recovery data "
+				     "from node %u for dead node %u\n",
+				     ndata->node_num, dead_node);
+				break;
+		}
+	}
+
+	mlog(0, "done requesting all lock info\n");
+
+	/* nodes should be sending reco data now
+	 * just need to wait */
+
+	while (1) {
+		/* check all the nodes now to see if we are
+		 * done, or if anyone died */
+		all_nodes_done = 1;
+		spin_lock(&dlm_reco_state_lock);
+		list_for_each(iter, &dlm->reco.node_data) {
+			ndata = list_entry (iter, struct dlm_reco_node_data, list);
+
+			mlog(0, "checking recovery state of node %u\n",
+			     ndata->node_num);
+			switch (ndata->state) {
+				case DLM_RECO_NODE_DATA_INIT:
+				case DLM_RECO_NODE_DATA_REQUESTING:
+					mlog(ML_ERROR, "bad ndata state for "
+					     "node %u: state=%d\n",
+					     ndata->node_num, ndata->state);
+					BUG();
+					break;
+				case DLM_RECO_NODE_DATA_DEAD:
+					mlog(0, "node %u died after "
+					     "requesting recovery info for "
+					     "node %u\n", ndata->node_num,
+					     dead_node);
+					spin_unlock(&dlm_reco_state_lock);
+					// start all over
+					destroy = 1;
+					status = -EAGAIN;
+					goto leave;
+				case DLM_RECO_NODE_DATA_RECEIVING:
+				case DLM_RECO_NODE_DATA_REQUESTED:
+					all_nodes_done = 0;
+					break;
+				case DLM_RECO_NODE_DATA_DONE:
+					break;
+				case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+					break;
+			}
+		}
+		spin_unlock(&dlm_reco_state_lock);
+
+		mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
+		     all_nodes_done?"yes":"no");
+		if (all_nodes_done) {
+			int ret;
+
+			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
+	 		 * just send a finalize message to everyone and
+	 		 * clean up */
+			mlog(0, "all nodes are done! send finalize\n");
+			ret = dlm_send_finalize_reco_message(dlm);
+			if (ret < 0)
+				mlog_errno(ret);
+
+			spin_lock(&dlm->spinlock);
+			dlm_finish_local_lockres_recovery(dlm, dead_node,
+							  dlm->node_num);
+			spin_unlock(&dlm->spinlock);
+			mlog(0, "should be done with recovery!\n");
+
+			mlog(0, "finishing recovery of %s at %lu, "
+			     "dead=%u, this=%u, new=%u\n", dlm->name,
+			     jiffies, dlm->reco.dead_node,
+			     dlm->node_num, dlm->reco.new_master);
+			destroy = 1;
+			status = ret;
+			/* rescan everything marked dirty along the way */
+			dlm_kick_thread(dlm, NULL);
+			break;
+		}
+		/* wait to be signalled, with periodic timeout
+		 * to check for node death */
+		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
+					 kthread_should_stop(),
+					 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
+
+	}
+
+leave:
+	if (destroy)
+		dlm_destroy_recovery_area(dlm, dead_node);
+
+	mlog_exit(status);
+	return status;
+}
+
+static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	int num=0;
+	struct dlm_reco_node_data *ndata;
+
+	spin_lock(&dlm->spinlock);
+	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+	/* nodes can only be removed (by dying) after dropping
+	 * this lock, and death will be trapped later, so this should do */
+	spin_unlock(&dlm->spinlock);
+
+	while (1) {
+		num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
+		if (num >= O2NM_MAX_NODES) {
+			break;
+		}
+		BUG_ON(num == dead_node);
+
+		ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+		if (!ndata) {
+			dlm_destroy_recovery_area(dlm, dead_node);
+			return -ENOMEM;
+		}
+		ndata->node_num = num;
+		ndata->state = DLM_RECO_NODE_DATA_INIT;
+		spin_lock(&dlm_reco_state_lock);
+		list_add_tail(&ndata->list, &dlm->reco.node_data);
+		spin_unlock(&dlm_reco_state_lock);
+		num++;
+	}
+
+	return 0;
+}
+
+static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
+{
+	struct list_head *iter, *iter2;
+	struct dlm_reco_node_data *ndata;
+	LIST_HEAD(tmplist);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_splice_init(&dlm->reco.node_data, &tmplist);
+	spin_unlock(&dlm_reco_state_lock);
+
+	list_for_each_safe(iter, iter2, &tmplist) {
+		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+		list_del_init(&ndata->list);
+		kfree(ndata);
+	}
+}
+
+static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
+				 u8 dead_node)
+{
+	struct dlm_lock_request lr;
+	enum dlm_status ret;
+
+	mlog(0, "\n");
+
+
+	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
+		  "to %u\n", dead_node, request_from);
+
+	memset(&lr, 0, sizeof(lr));
+	lr.node_idx = dlm->node_num;
+	lr.dead_node = dead_node;
+
+	// send message
+	ret = DLM_NOLOCKMGR;
+	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
+				 &lr, sizeof(lr), request_from, NULL);
+
+	/* negative status is handled by caller */
+	if (ret < 0)
+		mlog_errno(ret);
+
+	// return from here, then
+	// sleep until all received or error
+	return ret;
+
+}
+
+int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(lr->dead_node != dlm->reco.dead_node);
+
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!item) {
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+
+	/* this will get freed by dlm_request_all_locks_worker */
+	buf = (char *) __get_free_page(GFP_KERNEL);
+	if (!buf) {
+		kfree(item);
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
+
+	/* queue up work for dlm_request_all_locks_worker */
+	dlm_grab(dlm);  /* get an extra ref for the work item */
+	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
+	item->u.ral.reco_master = lr->node_idx;
+	item->u.ral.dead_node = lr->dead_node;
+	spin_lock(&dlm->work_lock);
+	list_add_tail(&item->list, &dlm->work_list);
+	spin_unlock(&dlm->work_lock);
+	schedule_work(&dlm->dispatched_work);
+
+	dlm_put(dlm);
+	return 0;
+}
+
+static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
+{
+	struct dlm_migratable_lockres *mres;
+	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm;
+	LIST_HEAD(resources);
+	struct list_head *iter;
+	int ret;
+	u8 dead_node, reco_master;
+
+	dlm = item->dlm;
+	dead_node = item->u.ral.dead_node;
+	reco_master = item->u.ral.reco_master;
+	BUG_ON(dead_node != dlm->reco.dead_node);
+	BUG_ON(reco_master != dlm->reco.new_master);
+
+	mres = (struct dlm_migratable_lockres *)data;
+
+	/* lock resources should have already been moved to the
+ 	 * dlm->reco.resources list.  now move items from that list
+ 	 * to a temp list if the dead owner matches.  note that the
+	 * whole cluster recovers only one node at a time, so we
+	 * can safely move UNKNOWN lock resources for each recovery
+	 * session. */
+	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
+
+	/* now we can begin blasting lockreses without the dlm lock */
+	list_for_each(iter, &resources) {
+		res = list_entry (iter, struct dlm_lock_resource, recovering);
+		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
+				   	DLM_MRES_RECOVERY);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	/* move the resources back to the list */
+	spin_lock(&dlm->spinlock);
+	list_splice_init(&resources, &dlm->reco.resources);
+	spin_unlock(&dlm->spinlock);
+
+	ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	free_page((unsigned long)data);
+}
+
+
+static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
+{
+	int ret, tmpret;
+	struct dlm_reco_data_done done_msg;
+
+	memset(&done_msg, 0, sizeof(done_msg));
+	done_msg.node_idx = dlm->node_num;
+	done_msg.dead_node = dead_node;
+	mlog(0, "sending DATA DONE message to %u, "
+	     "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
+	     done_msg.dead_node);
+
+	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
+				 sizeof(done_msg), send_to, &tmpret);
+	/* negative status is ignored by the caller */
+	if (ret >= 0)
+		ret = tmpret;
+	return ret;
+}
+
+
+int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
+	struct list_head *iter;
+	struct dlm_reco_node_data *ndata = NULL;
+	int ret = -EINVAL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+	     "node_idx=%u, this node=%u\n", done->dead_node,
+	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
+	BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+	spin_lock(&dlm_reco_state_lock);
+	list_for_each(iter, &dlm->reco.node_data) {
+		ndata = list_entry (iter, struct dlm_reco_node_data, list);
+		if (ndata->node_num != done->node_idx)
+			continue;
+
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+			case DLM_RECO_NODE_DATA_DEAD:
+			case DLM_RECO_NODE_DATA_DONE:
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				mlog(ML_ERROR, "bad ndata state for node %u:"
+				     " state=%d\n", ndata->node_num,
+				     ndata->state);
+				BUG();
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+			case DLM_RECO_NODE_DATA_REQUESTED:
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				mlog(0, "node %u is DONE sending "
+					  "recovery data!\n",
+					  ndata->node_num);
+
+				ndata->state = DLM_RECO_NODE_DATA_DONE;
+				ret = 0;
+				break;
+		}
+	}
+	spin_unlock(&dlm_reco_state_lock);
+
+	/* wake the recovery thread, some node is done */
+	if (!ret)
+		dlm_kick_recovery_thread(dlm);
+
+	if (ret < 0)
+		mlog(ML_ERROR, "failed to find recovery node data for node "
+		     "%u\n", done->node_idx);
+	dlm_put(dlm);
+
+	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
+	return ret;
+}
+
+static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
+					struct list_head *list,
+				       	u8 dead_node)
+{
+	struct dlm_lock_resource *res;
+	struct list_head *iter, *iter2;
+
+	spin_lock(&dlm->spinlock);
+	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
+		res = list_entry (iter, struct dlm_lock_resource, recovering);
+		if (dlm_is_recovery_lock(res->lockname.name,
+					 res->lockname.len))
+			continue;
+		if (res->owner == dead_node) {
+			mlog(0, "found lockres owned by dead node while "
+				  "doing recovery for node %u. sending it.\n",
+				  dead_node);
+			list_del_init(&res->recovering);
+			list_add_tail(&res->recovering, list);
+		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			mlog(0, "found UNKNOWN owner while doing recovery "
+				  "for node %u. sending it.\n", dead_node);
+			list_del_init(&res->recovering);
+			list_add_tail(&res->recovering, list);
+		}
+	}
+	spin_unlock(&dlm->spinlock);
+}
+
+static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
+{
+	int total_locks = 0;
+	struct list_head *iter, *queue = &res->granted;
+	int i;
+
+	for (i=0; i<3; i++) {
+		list_for_each(iter, queue)
+			total_locks++;
+		queue++;
+	}
+	return total_locks;
+}
+
+
+static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
+				      struct dlm_migratable_lockres *mres,
+				      u8 send_to,
+				      struct dlm_lock_resource *res,
+				      int total_locks)
+{
+	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
+	int mres_total_locks = be32_to_cpu(mres->total_locks);
+	int sz, ret = 0, status = 0;
+	u8 orig_flags = mres->flags,
+	   orig_master = mres->master;
+
+	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
+	if (!mres->num_locks)
+		return 0;
+
+	sz = sizeof(struct dlm_migratable_lockres) +
+		(mres->num_locks * sizeof(struct dlm_migratable_lock));
+
+	/* add an all-done flag if we reached the last lock */
+	orig_flags = mres->flags;
+	BUG_ON(total_locks > mres_total_locks);
+	if (total_locks == mres_total_locks)
+		mres->flags |= DLM_MRES_ALL_DONE;
+
+	/* send it */
+	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
+				 sz, send_to, &status);
+	if (ret < 0) {
+		/* XXX: negative status is not handled.
+		 * this will end up killing this node. */
+		mlog_errno(ret);
+	} else {
+		/* might get an -ENOMEM back here */
+		ret = status;
+		if (ret < 0) {
+			mlog_errno(ret);
+
+			if (ret == -EFAULT) {
+				mlog(ML_ERROR, "node %u told me to kill "
+				     "myself!\n", send_to);
+				BUG();
+			}
+		}
+	}
+
+	/* zero and reinit the message buffer */
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, mres_total_locks,
+				    mig_cookie, orig_flags, orig_master);
+	return ret;
+}
+
+static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
+					const char *lockname, int namelen,
+					int total_locks, u64 cookie,
+					u8 flags, u8 master)
+{
+	/* mres here is one full page */
+	memset(mres, 0, PAGE_SIZE);
+	mres->lockname_len = namelen;
+	memcpy(mres->lockname, lockname, namelen);
+	mres->num_locks = 0;
+	mres->total_locks = cpu_to_be32(total_locks);
+	mres->mig_cookie = cpu_to_be64(cookie);
+	mres->flags = flags;
+	mres->master = master;
+}
+
+
+/* returns 1 if this lock fills the network structure,
+ * 0 otherwise */
+static int dlm_add_lock_to_array(struct dlm_lock *lock,
+				 struct dlm_migratable_lockres *mres, int queue)
+{
+	struct dlm_migratable_lock *ml;
+	int lock_num = mres->num_locks;
+
+	ml = &(mres->ml[lock_num]);
+	ml->cookie = lock->ml.cookie;
+	ml->type = lock->ml.type;
+	ml->convert_type = lock->ml.convert_type;
+	ml->highest_blocked = lock->ml.highest_blocked;
+	ml->list = queue;
+	if (lock->lksb) {
+		ml->flags = lock->lksb->flags;
+		/* send our current lvb */
+		if (ml->type == LKM_EXMODE ||
+		    ml->type == LKM_PRMODE) {
+			/* if it is already set, this had better be a PR
+			 * and it has to match */
+			if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
+			    memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+				mlog(ML_ERROR, "mismatched lvbs!\n");
+				__dlm_print_one_lock_resource(lock->lockres);
+				BUG();
+			}
+			memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+		}
+	}
+	ml->node = lock->ml.node;
+	mres->num_locks++;
+	/* we reached the max, send this network message */
+	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
+		return 1;
+	return 0;
+}
+
+
+int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			 struct dlm_migratable_lockres *mres,
+			 u8 send_to, u8 flags)
+{
+	struct list_head *queue, *iter;
+	int total_locks, i;
+	u64 mig_cookie = 0;
+	struct dlm_lock *lock;
+	int ret = 0;
+
+	BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	mlog(0, "sending to %u\n", send_to);
+
+	total_locks = dlm_num_locks_in_lockres(res);
+	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
+		/* rare, but possible */
+		mlog(0, "argh.  lockres has %d locks.  this will "
+			  "require more than one network packet to "
+			  "migrate\n", total_locks);
+		mig_cookie = dlm_get_next_mig_cookie();
+	}
+
+	dlm_init_migratable_lockres(mres, res->lockname.name,
+				    res->lockname.len, total_locks,
+				    mig_cookie, flags, res->owner);
+
+	total_locks = 0;
+	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
+		queue = dlm_list_idx_to_ptr(res, i);
+		list_for_each(iter, queue) {
+			lock = list_entry (iter, struct dlm_lock, list);
+
+			/* add another lock. */
+			total_locks++;
+			if (!dlm_add_lock_to_array(lock, mres, i))
+				continue;
+
+			/* this filled the lock message,
+			 * we must send it immediately. */
+			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
+						       res, total_locks);
+			if (ret < 0) {
+				// TODO
+				mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
+				     "returned %d, TODO\n", ret);
+				BUG();
+			}
+		}
+	}
+	/* flush any remaining locks */
+	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
+	if (ret < 0) {
+		// TODO
+		mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
+		     "TODO\n", ret);
+		BUG();
+	}
+	return ret;
+}
+
+
+
+/*
+ * this message will contain no more than one page worth of
+ * recovery data, and it will work on only one lockres.
+ * there may be many locks in this page, and we may need to wait
+ * for additional packets to complete all the locks (rare, but
+ * possible).
+ */
+/*
+ * NOTE: the allocation error cases here are scary
+ * we really cannot afford to fail an alloc in recovery
+ * do we spin?  returning an error only delays the problem really
+ */
+
+int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_migratable_lockres *mres =
+		(struct dlm_migratable_lockres *)msg->buf;
+	int ret = 0;
+	u8 real_master;
+	char *buf = NULL;
+	struct dlm_work_item *item = NULL;
+	struct dlm_lock_resource *res = NULL;
+
+	if (!dlm_grab(dlm))
+		return -EINVAL;
+
+	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
+
+	real_master = mres->master;
+	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* cannot migrate a lockres with no master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+	}
+
+	mlog(0, "%s message received from node %u\n",
+		  (mres->flags & DLM_MRES_RECOVERY) ?
+		  "recovery" : "migration", mres->master);
+	if (mres->flags & DLM_MRES_ALL_DONE)
+		mlog(0, "all done flag.  all lockres data received!\n");
+
+	ret = -ENOMEM;
+	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
+	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	if (!buf || !item)
+		goto leave;
+
+	/* lookup the lock to see if we have a secondary queue for this
+	 * already...  just add the locks in and this will have its owner
+	 * and RECOVERY flag changed when it completes. */
+	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+	if (res) {
+	 	/* this will get a ref on res */
+		/* mark it as recovering/migrating and hash it */
+		spin_lock(&res->spinlock);
+		if (mres->flags & DLM_MRES_RECOVERY) {
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		} else {
+			if (res->state & DLM_LOCK_RES_MIGRATING) {
+				/* this is at least the second
+				 * lockres message */
+				mlog(0, "lock %.*s is already migrating\n",
+					  mres->lockname_len,
+					  mres->lockname);
+			} else if (res->state & DLM_LOCK_RES_RECOVERING) {
+				/* caller should BUG */
+				mlog(ML_ERROR, "node is attempting to migrate "
+				     "lock %.*s, but marked as recovering!\n",
+				     mres->lockname_len, mres->lockname);
+				ret = -EFAULT;
+				spin_unlock(&res->spinlock);
+				goto leave;
+			}
+			res->state |= DLM_LOCK_RES_MIGRATING;
+		}
+		spin_unlock(&res->spinlock);
+	} else {
+		/* need to allocate, just like if it was
+		 * mastered here normally  */
+		res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
+		if (!res)
+			goto leave;
+
+		/* to match the ref that we would have gotten if
+		 * dlm_lookup_lockres had succeeded */
+		dlm_lockres_get(res);
+
+		/* mark it as recovering/migrating and hash it */
+		if (mres->flags & DLM_MRES_RECOVERY)
+			res->state |= DLM_LOCK_RES_RECOVERING;
+		else
+			res->state |= DLM_LOCK_RES_MIGRATING;
+
+		spin_lock(&dlm->spinlock);
+		__dlm_insert_lockres(dlm, res);
+		spin_unlock(&dlm->spinlock);
+
+		/* now that the new lockres is inserted,
+		 * make it usable by other processes */
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
+		spin_unlock(&res->spinlock);
+
+		/* add an extra ref for just-allocated lockres 
+		 * otherwise the lockres will be purged immediately */
+		dlm_lockres_get(res);
+
+	}
+
+	/* at this point we have allocated everything we need,
+	 * and we have a hashed lockres with an extra ref and
+	 * the proper res->state flags. */
+	ret = 0;
+	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
+		/* migration cannot have an unknown master */
+		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
+		mlog(0, "recovery has passed me a lockres with an "
+			  "unknown owner.. will need to requery: "
+			  "%.*s\n",