Blame - fs/ocfs2/dlm/dlmrecovery.c - kernel/msm-4.9

blob: 325c9f5529c15d9d6ef2156dac7ab0d73528893f [file] [log] [blame]

Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1	/* -- mode: c; c-basic-offset: 8; --
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* dlmrecovery.c
				5	*
				6	* recovery stuff
				7	*
				8	* Copyright (C) 2004 Oracle. All rights reserved.
				9	*
				10	* This program is free software; you can redistribute it and/or
				11	* modify it under the terms of the GNU General Public
				12	* License as published by the Free Software Foundation; either
				13	* version 2 of the License, or (at your option) any later version.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 02111-1307, USA.
				24	*
				25	*/
				26
				27
				28	#include <linux/module.h>
				29	#include <linux/fs.h>
				30	#include <linux/types.h>
				31	#include <linux/slab.h>
				32	#include <linux/highmem.h>
				33	#include <linux/utsname.h>
				34	#include <linux/init.h>
				35	#include <linux/sysctl.h>
				36	#include <linux/random.h>
				37	#include <linux/blkdev.h>
				38	#include <linux/socket.h>
				39	#include <linux/inet.h>
				40	#include <linux/timer.h>
				41	#include <linux/kthread.h>
				42
				43
				44	#include "cluster/heartbeat.h"
				45	#include "cluster/nodemanager.h"
				46	#include "cluster/tcp.h"
				47
				48	#include "dlmapi.h"
				49	#include "dlmcommon.h"
				50	#include "dlmdomain.h"
				51
				52	#define MLOG_MASK_PREFIX (ML_DLM\|ML_DLM_RECOVERY)
				53	#include "cluster/masklog.h"
				54
				55	static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
				56
				57	static int dlm_recovery_thread(void *data);
				58	void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
				59	int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
				60	static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
				61	static int dlm_do_recovery(struct dlm_ctxt *dlm);
				62
				63	static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
				64	static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
				65	static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
				66	static int dlm_request_all_locks(struct dlm_ctxt *dlm,
				67	u8 request_from, u8 dead_node);
				68	static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
				69
				70	static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
				71	static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
				72	const char *lockname, int namelen,
				73	int total_locks, u64 cookie,
				74	u8 flags, u8 master);
				75	static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				76	struct dlm_migratable_lockres *mres,
				77	u8 send_to,
				78	struct dlm_lock_resource *res,
				79	int total_locks);
				80	static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
				81	struct dlm_lock_resource *res,
				82	u8 *real_master);
				83	static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
				84	struct dlm_lock_resource *res,
				85	struct dlm_migratable_lockres *mres);
				86	static int dlm_do_master_requery(struct dlm_ctxt *dlm,
				87	struct dlm_lock_resource *res,
				88	u8 nodenum, u8 *real_master);
				89	static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
				90	static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
				91	u8 dead_node, u8 send_to);
				92	static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
				93	static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
				94	struct list_head *list, u8 dead_node);
				95	static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
				96	u8 dead_node, u8 new_master);
				97	static void dlm_reco_ast(void *astdata);
				98	static void dlm_reco_bast(void *astdata, int blocked_type);
				99	static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
				100	static void dlm_request_all_locks_worker(struct dlm_work_item *item,
				101	void *data);
				102	static void dlm_mig_lockres_worker(struct dlm_work_item item, void data);
				103
				104	static u64 dlm_get_next_mig_cookie(void);
				105
				106	static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
				107	static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
				108	static u64 dlm_mig_cookie = 1;
				109
				110	static u64 dlm_get_next_mig_cookie(void)
				111	{
				112	u64 c;
				113	spin_lock(&dlm_mig_cookie_lock);
				114	c = dlm_mig_cookie;
				115	if (dlm_mig_cookie == (~0ULL))
				116	dlm_mig_cookie = 1;
				117	else
				118	dlm_mig_cookie++;
				119	spin_unlock(&dlm_mig_cookie_lock);
				120	return c;
				121	}
				122
				123	static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
				124	{
				125	spin_lock(&dlm->spinlock);
				126	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
				127	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
				128	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
				129	spin_unlock(&dlm->spinlock);
				130	}
				131
				132	/* Worker function used during recovery. */
				133	void dlm_dispatch_work(void *data)
				134	{
				135	struct dlm_ctxt dlm = (struct dlm_ctxt )data;
				136	LIST_HEAD(tmp_list);
				137	struct list_head iter, iter2;
				138	struct dlm_work_item *item;
				139	dlm_workfunc_t *workfunc;
				140
				141	spin_lock(&dlm->work_lock);
				142	list_splice_init(&dlm->work_list, &tmp_list);
				143	spin_unlock(&dlm->work_lock);
				144
				145	list_for_each_safe(iter, iter2, &tmp_list) {
				146	item = list_entry(iter, struct dlm_work_item, list);
				147	workfunc = item->func;
				148	list_del_init(&item->list);
				149
				150	/* already have ref on dlm to avoid having
				151	* it disappear. just double-check. */
				152	BUG_ON(item->dlm != dlm);
				153
				154	/* this is allowed to sleep and
				155	* call network stuff */
				156	workfunc(item, item->data);
				157
				158	dlm_put(dlm);
				159	kfree(item);
				160	}
				161	}
				162
				163	/*
				164	* RECOVERY THREAD
				165	*/
				166
				167	static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
				168	{
				169	/* wake the recovery thread
				170	* this will wake the reco thread in one of three places
				171	* 1) sleeping with no recovery happening
				172	* 2) sleeping with recovery mastered elsewhere
				173	* 3) recovery mastered here, waiting on reco data */
				174
				175	wake_up(&dlm->dlm_reco_thread_wq);
				176	}
				177
				178	/* Launch the recovery thread */
				179	int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
				180	{
				181	mlog(0, "starting dlm recovery thread...\n");
				182
				183	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
				184	"dlm_reco_thread");
				185	if (IS_ERR(dlm->dlm_reco_thread_task)) {
				186	mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
				187	dlm->dlm_reco_thread_task = NULL;
				188	return -EINVAL;
				189	}
				190
				191	return 0;
				192	}
				193
				194	void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
				195	{
				196	if (dlm->dlm_reco_thread_task) {
				197	mlog(0, "waiting for dlm recovery thread to exit\n");
				198	kthread_stop(dlm->dlm_reco_thread_task);
				199	dlm->dlm_reco_thread_task = NULL;
				200	}
				201	}
				202
				203
				204
				205	/*
				206	* this is lame, but here's how recovery works...
				207	* 1) all recovery threads cluster wide will work on recovering
				208	* ONE node at a time
				209	* 2) negotiate who will take over all the locks for the dead node.
				210	* that's right... ALL the locks.
				211	* 3) once a new master is chosen, everyone scans all locks
				212	* and moves aside those mastered by the dead guy
				213	* 4) each of these locks should be locked until recovery is done
				214	* 5) the new master collects up all of secondary lock queue info
				215	* one lock at a time, forcing each node to communicate back
				216	* before continuing
				217	* 6) each secondary lock queue responds with the full known lock info
				218	* 7) once the new master has run all its locks, it sends a ALLDONE!
				219	* message to everyone
				220	* 8) upon receiving this message, the secondary queue node unlocks
				221	* and responds to the ALLDONE
				222	* 9) once the new master gets responses from everyone, he unlocks
				223	* everything and recovery for this dead node is done
				224	*10) go back to 2) while there are still dead nodes
				225	*
				226	*/
				227
				228
				229	#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
				230
				231	static int dlm_recovery_thread(void *data)
				232	{
				233	int status;
				234	struct dlm_ctxt *dlm = data;
				235	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
				236
				237	mlog(0, "dlm thread running for %s...\n", dlm->name);
				238
				239	while (!kthread_should_stop()) {
				240	if (dlm_joined(dlm)) {
				241	status = dlm_do_recovery(dlm);
				242	if (status == -EAGAIN) {
				243	/* do not sleep, recheck immediately. */
				244	continue;
				245	}
				246	if (status < 0)
				247	mlog_errno(status);
				248	}
				249
				250	wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
				251	kthread_should_stop(),
				252	timeout);
				253	}
				254
				255	mlog(0, "quitting DLM recovery thread\n");
				256	return 0;
				257	}
				258
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	259	/* returns true when the recovery master has contacted us */
				260	static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
				261	{
				262	int ready;
				263	spin_lock(&dlm->spinlock);
				264	ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
				265	spin_unlock(&dlm->spinlock);
				266	return ready;
				267	}
				268
				269	/* returns true if node is no longer in the domain
				270	* could be dead or just not joined */
				271	int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
				272	{
				273	int dead;
				274	spin_lock(&dlm->spinlock);
				275	dead = test_bit(node, dlm->domain_map);
				276	spin_unlock(&dlm->spinlock);
				277	return dead;
				278	}
				279
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	280	/* callers of the top-level api calls (dlmlock/dlmunlock) should
				281	* block on the dlm->reco.event when recovery is in progress.
				282	* the dlm recovery thread will set this state when it begins
				283	* recovering a dead node (as the new master or not) and clear
				284	* the state and wake as soon as all affected lock resources have
				285	* been marked with the RECOVERY flag */
				286	static int dlm_in_recovery(struct dlm_ctxt *dlm)
				287	{
				288	int in_recovery;
				289	spin_lock(&dlm->spinlock);
				290	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
				291	spin_unlock(&dlm->spinlock);
				292	return in_recovery;
				293	}
				294
				295
				296	void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
				297	{
				298	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
				299	}
				300
				301	static void dlm_begin_recovery(struct dlm_ctxt *dlm)
				302	{
				303	spin_lock(&dlm->spinlock);
				304	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
				305	dlm->reco.state \|= DLM_RECO_STATE_ACTIVE;
				306	spin_unlock(&dlm->spinlock);
				307	}
				308
				309	static void dlm_end_recovery(struct dlm_ctxt *dlm)
				310	{
				311	spin_lock(&dlm->spinlock);
				312	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
				313	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
				314	spin_unlock(&dlm->spinlock);
				315	wake_up(&dlm->reco.event);
				316	}
				317
				318	static int dlm_do_recovery(struct dlm_ctxt *dlm)
				319	{
				320	int status = 0;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	321	int ret;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	322
				323	spin_lock(&dlm->spinlock);
				324
				325	/* check to see if the new master has died */
				326	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
				327	test_bit(dlm->reco.new_master, dlm->recovery_map)) {
				328	mlog(0, "new master %u died while recovering %u!\n",
				329	dlm->reco.new_master, dlm->reco.dead_node);
				330	/* unset the new_master, leave dead_node */
				331	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
				332	}
				333
				334	/* select a target to recover */
				335	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
				336	int bit;
				337
				338	bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
				339	if (bit >= O2NM_MAX_NODES \|\| bit < 0)
				340	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
				341	else
				342	dlm->reco.dead_node = bit;
				343	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
				344	/* BUG? */
				345	mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
				346	dlm->reco.dead_node);
				347	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
				348	}
				349
				350	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
				351	// mlog(0, "nothing to recover! sleeping now!\n");
				352	spin_unlock(&dlm->spinlock);
				353	/* return to main thread loop and sleep. */
				354	return 0;
				355	}
				356	mlog(0, "recovery thread found node %u in the recovery map!\n",
				357	dlm->reco.dead_node);
				358	spin_unlock(&dlm->spinlock);
				359
				360	/* take write barrier */
				361	/* (stops the list reshuffling thread, proxy ast handling) */
				362	dlm_begin_recovery(dlm);
				363
				364	if (dlm->reco.new_master == dlm->node_num)
				365	goto master_here;
				366
				367	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	368	/* choose a new master, returns 0 if this node
				369	* is the master, -EEXIST if it's another node.
				370	* this does not return until a new master is chosen
				371	* or recovery completes entirely. */
				372	ret = dlm_pick_recovery_master(dlm);
				373	if (!ret) {
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	374	/* already notified everyone. go. */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	375	goto master_here;
				376	}
				377	mlog(0, "another node will master this recovery session.\n");
				378	}
				379	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
				380	dlm->name, dlm->reco.new_master,
				381	dlm->node_num, dlm->reco.dead_node);
				382
				383	/* it is safe to start everything back up here
				384	* because all of the dead node's lock resources
				385	* have been marked as in-recovery */
				386	dlm_end_recovery(dlm);
				387
				388	/* sleep out in main dlm_recovery_thread loop. */
				389	return 0;
				390
				391	master_here:
				392	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
				393	dlm->name, dlm->reco.dead_node, dlm->node_num);
				394
				395	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
				396	if (status < 0) {
				397	mlog(ML_ERROR, "error %d remastering locks for node %u, "
				398	"retrying.\n", status, dlm->reco.dead_node);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	399	/* yield a bit to allow any final network messages
				400	* to get handled on remaining nodes */
				401	msleep(100);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	402	} else {
				403	/* success! see if any other nodes need recovery */
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	404	mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
				405	dlm->name, dlm->reco.dead_node, dlm->node_num);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	406	dlm_reset_recovery(dlm);
				407	}
				408	dlm_end_recovery(dlm);
				409
				410	/* continue and look for another dead node */
				411	return -EAGAIN;
				412	}
				413
				414	static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
				415	{
				416	int status = 0;
				417	struct dlm_reco_node_data *ndata;
				418	struct list_head *iter;
				419	int all_nodes_done;
				420	int destroy = 0;
				421	int pass = 0;
				422
				423	status = dlm_init_recovery_area(dlm, dead_node);
				424	if (status < 0)
				425	goto leave;
				426
				427	/* safe to access the node data list without a lock, since this
				428	* process is the only one to change the list */
				429	list_for_each(iter, &dlm->reco.node_data) {
				430	ndata = list_entry (iter, struct dlm_reco_node_data, list);
				431	BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
				432	ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
				433
				434	mlog(0, "requesting lock info from node %u\n",
				435	ndata->node_num);
				436
				437	if (ndata->node_num == dlm->node_num) {
				438	ndata->state = DLM_RECO_NODE_DATA_DONE;
				439	continue;
				440	}
				441
				442	status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
				443	if (status < 0) {
				444	mlog_errno(status);
				445	if (dlm_is_host_down(status))
				446	ndata->state = DLM_RECO_NODE_DATA_DEAD;
				447	else {
				448	destroy = 1;
				449	goto leave;
				450	}
				451	}
				452
				453	switch (ndata->state) {
				454	case DLM_RECO_NODE_DATA_INIT:
				455	case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				456	case DLM_RECO_NODE_DATA_REQUESTED:
				457	BUG();
				458	break;
				459	case DLM_RECO_NODE_DATA_DEAD:
				460	mlog(0, "node %u died after requesting "
				461	"recovery info for node %u\n",
				462	ndata->node_num, dead_node);
				463	// start all over
				464	destroy = 1;
				465	status = -EAGAIN;
				466	goto leave;
				467	case DLM_RECO_NODE_DATA_REQUESTING:
				468	ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
				469	mlog(0, "now receiving recovery data from "
				470	"node %u for dead node %u\n",
				471	ndata->node_num, dead_node);
				472	break;
				473	case DLM_RECO_NODE_DATA_RECEIVING:
				474	mlog(0, "already receiving recovery data from "
				475	"node %u for dead node %u\n",
				476	ndata->node_num, dead_node);
				477	break;
				478	case DLM_RECO_NODE_DATA_DONE:
				479	mlog(0, "already DONE receiving recovery data "
				480	"from node %u for dead node %u\n",
				481	ndata->node_num, dead_node);
				482	break;
				483	}
				484	}
				485
				486	mlog(0, "done requesting all lock info\n");
				487
				488	/* nodes should be sending reco data now
				489	* just need to wait */
				490
				491	while (1) {
				492	/* check all the nodes now to see if we are
				493	* done, or if anyone died */
				494	all_nodes_done = 1;
				495	spin_lock(&dlm_reco_state_lock);
				496	list_for_each(iter, &dlm->reco.node_data) {
				497	ndata = list_entry (iter, struct dlm_reco_node_data, list);
				498
				499	mlog(0, "checking recovery state of node %u\n",
				500	ndata->node_num);
				501	switch (ndata->state) {
				502	case DLM_RECO_NODE_DATA_INIT:
				503	case DLM_RECO_NODE_DATA_REQUESTING:
				504	mlog(ML_ERROR, "bad ndata state for "
				505	"node %u: state=%d\n",
				506	ndata->node_num, ndata->state);
				507	BUG();
				508	break;
				509	case DLM_RECO_NODE_DATA_DEAD:
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	510	mlog(ML_NOTICE, "node %u died after "
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	511	"requesting recovery info for "
				512	"node %u\n", ndata->node_num,
				513	dead_node);
				514	spin_unlock(&dlm_reco_state_lock);
				515	// start all over
				516	destroy = 1;
				517	status = -EAGAIN;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	518	/* instead of spinning like crazy here,
				519	* wait for the domain map to catch up
				520	* with the network state. otherwise this
				521	* can be hit hundreds of times before
				522	* the node is really seen as dead. */
				523	wait_event_timeout(dlm->dlm_reco_thread_wq,
				524	dlm_is_node_dead(dlm,
				525	ndata->node_num),
				526	msecs_to_jiffies(1000));
				527	mlog(0, "waited 1 sec for %u, "
				528	"dead? %s\n", ndata->node_num,
				529	dlm_is_node_dead(dlm, ndata->node_num) ?
				530	"yes" : "no");
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	531	goto leave;
				532	case DLM_RECO_NODE_DATA_RECEIVING:
				533	case DLM_RECO_NODE_DATA_REQUESTED:
				534	all_nodes_done = 0;
				535	break;
				536	case DLM_RECO_NODE_DATA_DONE:
				537	break;
				538	case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				539	break;
				540	}
				541	}
				542	spin_unlock(&dlm_reco_state_lock);
				543
				544	mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
				545	all_nodes_done?"yes":"no");
				546	if (all_nodes_done) {
				547	int ret;
				548
				549	/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
				550	* just send a finalize message to everyone and
				551	* clean up */
				552	mlog(0, "all nodes are done! send finalize\n");
				553	ret = dlm_send_finalize_reco_message(dlm);
				554	if (ret < 0)
				555	mlog_errno(ret);
				556
				557	spin_lock(&dlm->spinlock);
				558	dlm_finish_local_lockres_recovery(dlm, dead_node,
				559	dlm->node_num);
				560	spin_unlock(&dlm->spinlock);
				561	mlog(0, "should be done with recovery!\n");
				562
				563	mlog(0, "finishing recovery of %s at %lu, "
				564	"dead=%u, this=%u, new=%u\n", dlm->name,
				565	jiffies, dlm->reco.dead_node,
				566	dlm->node_num, dlm->reco.new_master);
				567	destroy = 1;
				568	status = ret;
				569	/* rescan everything marked dirty along the way */
				570	dlm_kick_thread(dlm, NULL);
				571	break;
				572	}
				573	/* wait to be signalled, with periodic timeout
				574	* to check for node death */
				575	wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
				576	kthread_should_stop(),
				577	msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
				578
				579	}
				580
				581	leave:
				582	if (destroy)
				583	dlm_destroy_recovery_area(dlm, dead_node);
				584
				585	mlog_exit(status);
				586	return status;
				587	}
				588
				589	static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
				590	{
				591	int num=0;
				592	struct dlm_reco_node_data *ndata;
				593
				594	spin_lock(&dlm->spinlock);
				595	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
				596	/* nodes can only be removed (by dying) after dropping
				597	* this lock, and death will be trapped later, so this should do */
				598	spin_unlock(&dlm->spinlock);
				599
				600	while (1) {
				601	num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
				602	if (num >= O2NM_MAX_NODES) {
				603	break;
				604	}
				605	BUG_ON(num == dead_node);
				606
				607	ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
				608	if (!ndata) {
				609	dlm_destroy_recovery_area(dlm, dead_node);
				610	return -ENOMEM;
				611	}
				612	ndata->node_num = num;
				613	ndata->state = DLM_RECO_NODE_DATA_INIT;
				614	spin_lock(&dlm_reco_state_lock);
				615	list_add_tail(&ndata->list, &dlm->reco.node_data);
				616	spin_unlock(&dlm_reco_state_lock);
				617	num++;
				618	}
				619
				620	return 0;
				621	}
				622
				623	static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
				624	{
				625	struct list_head iter, iter2;
				626	struct dlm_reco_node_data *ndata;
				627	LIST_HEAD(tmplist);
				628
				629	spin_lock(&dlm_reco_state_lock);
				630	list_splice_init(&dlm->reco.node_data, &tmplist);
				631	spin_unlock(&dlm_reco_state_lock);
				632
				633	list_for_each_safe(iter, iter2, &tmplist) {
				634	ndata = list_entry (iter, struct dlm_reco_node_data, list);
				635	list_del_init(&ndata->list);
				636	kfree(ndata);
				637	}
				638	}
				639
				640	static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
				641	u8 dead_node)
				642	{
				643	struct dlm_lock_request lr;
				644	enum dlm_status ret;
				645
				646	mlog(0, "\n");
				647
				648
				649	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
				650	"to %u\n", dead_node, request_from);
				651
				652	memset(&lr, 0, sizeof(lr));
				653	lr.node_idx = dlm->node_num;
				654	lr.dead_node = dead_node;
				655
				656	// send message
				657	ret = DLM_NOLOCKMGR;
				658	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
				659	&lr, sizeof(lr), request_from, NULL);
				660
				661	/* negative status is handled by caller */
				662	if (ret < 0)
				663	mlog_errno(ret);
				664
				665	// return from here, then
				666	// sleep until all received or error
				667	return ret;
				668
				669	}
				670
				671	int dlm_request_all_locks_handler(struct o2net_msg msg, u32 len, void data)
				672	{
				673	struct dlm_ctxt *dlm = data;
				674	struct dlm_lock_request lr = (struct dlm_lock_request )msg->buf;
				675	char *buf = NULL;
				676	struct dlm_work_item *item = NULL;
				677
				678	if (!dlm_grab(dlm))
				679	return -EINVAL;
				680
				681	BUG_ON(lr->dead_node != dlm->reco.dead_node);
				682
				683	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
				684	if (!item) {
				685	dlm_put(dlm);
				686	return -ENOMEM;
				687	}
				688
				689	/* this will get freed by dlm_request_all_locks_worker */
				690	buf = (char *) __get_free_page(GFP_KERNEL);
				691	if (!buf) {
				692	kfree(item);
				693	dlm_put(dlm);
				694	return -ENOMEM;
				695	}
				696
				697	/* queue up work for dlm_request_all_locks_worker */
				698	dlm_grab(dlm); /* get an extra ref for the work item */
				699	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
				700	item->u.ral.reco_master = lr->node_idx;
				701	item->u.ral.dead_node = lr->dead_node;
				702	spin_lock(&dlm->work_lock);
				703	list_add_tail(&item->list, &dlm->work_list);
				704	spin_unlock(&dlm->work_lock);
				705	schedule_work(&dlm->dispatched_work);
				706
				707	dlm_put(dlm);
				708	return 0;
				709	}
				710
				711	static void dlm_request_all_locks_worker(struct dlm_work_item item, void data)
				712	{
				713	struct dlm_migratable_lockres *mres;
				714	struct dlm_lock_resource *res;
				715	struct dlm_ctxt *dlm;
				716	LIST_HEAD(resources);
				717	struct list_head *iter;
				718	int ret;
				719	u8 dead_node, reco_master;
				720
				721	dlm = item->dlm;
				722	dead_node = item->u.ral.dead_node;
				723	reco_master = item->u.ral.reco_master;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	724	mres = (struct dlm_migratable_lockres *)data;
				725
				726	if (dead_node != dlm->reco.dead_node \|\|
				727	reco_master != dlm->reco.new_master) {
				728	/* show extra debug info if the recovery state is messed */
				729	mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
				730	"request(dead=%u, master=%u)\n",
				731	dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
				732	dead_node, reco_master);
				733	mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
				734	"entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
				735	dlm->name, mres->lockname_len, mres->lockname, mres->master,
				736	mres->num_locks, mres->total_locks, mres->flags,
				737	mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags,
				738	mres->ml[0].type, mres->ml[0].convert_type,
				739	mres->ml[0].highest_blocked, mres->ml[0].node);
				740	BUG();
				741	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	742	BUG_ON(dead_node != dlm->reco.dead_node);
				743	BUG_ON(reco_master != dlm->reco.new_master);
				744
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	745	/* lock resources should have already been moved to the
				746	* dlm->reco.resources list. now move items from that list
				747	* to a temp list if the dead owner matches. note that the
				748	* whole cluster recovers only one node at a time, so we
				749	* can safely move UNKNOWN lock resources for each recovery
				750	* session. */
				751	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
				752
				753	/* now we can begin blasting lockreses without the dlm lock */
				754	list_for_each(iter, &resources) {
				755	res = list_entry (iter, struct dlm_lock_resource, recovering);
				756	ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
				757	DLM_MRES_RECOVERY);
				758	if (ret < 0)
				759	mlog_errno(ret);
				760	}
				761
				762	/* move the resources back to the list */
				763	spin_lock(&dlm->spinlock);
				764	list_splice_init(&resources, &dlm->reco.resources);
				765	spin_unlock(&dlm->spinlock);
				766
				767	ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
				768	if (ret < 0)
				769	mlog_errno(ret);
				770
				771	free_page((unsigned long)data);
				772	}
				773
				774
				775	static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
				776	{
				777	int ret, tmpret;
				778	struct dlm_reco_data_done done_msg;
				779
				780	memset(&done_msg, 0, sizeof(done_msg));
				781	done_msg.node_idx = dlm->node_num;
				782	done_msg.dead_node = dead_node;
				783	mlog(0, "sending DATA DONE message to %u, "
				784	"my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
				785	done_msg.dead_node);
				786
				787	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
				788	sizeof(done_msg), send_to, &tmpret);
				789	/* negative status is ignored by the caller */
				790	if (ret >= 0)
				791	ret = tmpret;
				792	return ret;
				793	}
				794
				795
				796	int dlm_reco_data_done_handler(struct o2net_msg msg, u32 len, void data)
				797	{
				798	struct dlm_ctxt *dlm = data;
				799	struct dlm_reco_data_done done = (struct dlm_reco_data_done )msg->buf;
				800	struct list_head *iter;
				801	struct dlm_reco_node_data *ndata = NULL;
				802	int ret = -EINVAL;
				803
				804	if (!dlm_grab(dlm))
				805	return -EINVAL;
				806
				807	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
				808	"node_idx=%u, this node=%u\n", done->dead_node,
				809	dlm->reco.dead_node, done->node_idx, dlm->node_num);
				810	BUG_ON(done->dead_node != dlm->reco.dead_node);
				811
				812	spin_lock(&dlm_reco_state_lock);
				813	list_for_each(iter, &dlm->reco.node_data) {
				814	ndata = list_entry (iter, struct dlm_reco_node_data, list);
				815	if (ndata->node_num != done->node_idx)
				816	continue;
				817
				818	switch (ndata->state) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	819	/* should have moved beyond INIT but not to FINALIZE yet */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	820	case DLM_RECO_NODE_DATA_INIT:
				821	case DLM_RECO_NODE_DATA_DEAD:
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	822	case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				823	mlog(ML_ERROR, "bad ndata state for node %u:"
				824	" state=%d\n", ndata->node_num,
				825	ndata->state);
				826	BUG();
				827	break;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	828	/* these states are possible at this point, anywhere along
				829	* the line of recovery */
				830	case DLM_RECO_NODE_DATA_DONE:
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	831	case DLM_RECO_NODE_DATA_RECEIVING:
				832	case DLM_RECO_NODE_DATA_REQUESTED:
				833	case DLM_RECO_NODE_DATA_REQUESTING:
				834	mlog(0, "node %u is DONE sending "
				835	"recovery data!\n",
				836	ndata->node_num);
				837
				838	ndata->state = DLM_RECO_NODE_DATA_DONE;
				839	ret = 0;
				840	break;
				841	}
				842	}
				843	spin_unlock(&dlm_reco_state_lock);
				844
				845	/* wake the recovery thread, some node is done */
				846	if (!ret)
				847	dlm_kick_recovery_thread(dlm);
				848
				849	if (ret < 0)
				850	mlog(ML_ERROR, "failed to find recovery node data for node "
				851	"%u\n", done->node_idx);
				852	dlm_put(dlm);
				853
				854	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
				855	return ret;
				856	}
				857
				858	static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
				859	struct list_head *list,
				860	u8 dead_node)
				861	{
				862	struct dlm_lock_resource *res;
				863	struct list_head iter, iter2;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	864	struct dlm_lock *lock;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	865
				866	spin_lock(&dlm->spinlock);
				867	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
				868	res = list_entry (iter, struct dlm_lock_resource, recovering);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	869	/* always prune any $RECOVERY entries for dead nodes,
				870	* otherwise hangs can occur during later recovery */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	871	if (dlm_is_recovery_lock(res->lockname.name,
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	872	res->lockname.len)) {
				873	spin_lock(&res->spinlock);
				874	list_for_each_entry(lock, &res->granted, list) {
				875	if (lock->ml.node == dead_node) {
				876	mlog(0, "AHA! there was "
				877	"a $RECOVERY lock for dead "
				878	"node %u (%s)!\n",
				879	dead_node, dlm->name);
				880	list_del_init(&lock->list);
				881	dlm_lock_put(lock);
				882	break;
				883	}
				884	}
				885	spin_unlock(&res->spinlock);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	886	continue;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	887	}
				888
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	889	if (res->owner == dead_node) {
				890	mlog(0, "found lockres owned by dead node while "
				891	"doing recovery for node %u. sending it.\n",
				892	dead_node);
				893	list_del_init(&res->recovering);
				894	list_add_tail(&res->recovering, list);
				895	} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
				896	mlog(0, "found UNKNOWN owner while doing recovery "
				897	"for node %u. sending it.\n", dead_node);
				898	list_del_init(&res->recovering);
				899	list_add_tail(&res->recovering, list);
				900	}
				901	}
				902	spin_unlock(&dlm->spinlock);
				903	}
				904
				905	static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
				906	{
				907	int total_locks = 0;
				908	struct list_head iter, queue = &res->granted;
				909	int i;
				910
				911	for (i=0; i<3; i++) {
				912	list_for_each(iter, queue)
				913	total_locks++;
				914	queue++;
				915	}
				916	return total_locks;
				917	}
				918
				919
				920	static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				921	struct dlm_migratable_lockres *mres,
				922	u8 send_to,
				923	struct dlm_lock_resource *res,
				924	int total_locks)
				925	{
				926	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
				927	int mres_total_locks = be32_to_cpu(mres->total_locks);
				928	int sz, ret = 0, status = 0;
				929	u8 orig_flags = mres->flags,
				930	orig_master = mres->master;
				931
				932	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
				933	if (!mres->num_locks)
				934	return 0;
				935
				936	sz = sizeof(struct dlm_migratable_lockres) +
				937	(mres->num_locks * sizeof(struct dlm_migratable_lock));
				938
				939	/* add an all-done flag if we reached the last lock */
				940	orig_flags = mres->flags;
				941	BUG_ON(total_locks > mres_total_locks);
				942	if (total_locks == mres_total_locks)
				943	mres->flags \|= DLM_MRES_ALL_DONE;
				944
				945	/* send it */
				946	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
				947	sz, send_to, &status);
				948	if (ret < 0) {
				949	/* XXX: negative status is not handled.
				950	* this will end up killing this node. */
				951	mlog_errno(ret);
				952	} else {
				953	/* might get an -ENOMEM back here */
				954	ret = status;
				955	if (ret < 0) {
				956	mlog_errno(ret);
				957
				958	if (ret == -EFAULT) {
				959	mlog(ML_ERROR, "node %u told me to kill "
				960	"myself!\n", send_to);
				961	BUG();
				962	}
				963	}
				964	}
				965
				966	/* zero and reinit the message buffer */
				967	dlm_init_migratable_lockres(mres, res->lockname.name,
				968	res->lockname.len, mres_total_locks,
				969	mig_cookie, orig_flags, orig_master);
				970	return ret;
				971	}
				972
				973	static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
				974	const char *lockname, int namelen,
				975	int total_locks, u64 cookie,
				976	u8 flags, u8 master)
				977	{
				978	/* mres here is one full page */
				979	memset(mres, 0, PAGE_SIZE);
				980	mres->lockname_len = namelen;
				981	memcpy(mres->lockname, lockname, namelen);
				982	mres->num_locks = 0;
				983	mres->total_locks = cpu_to_be32(total_locks);
				984	mres->mig_cookie = cpu_to_be64(cookie);
				985	mres->flags = flags;
				986	mres->master = master;
				987	}
				988
				989
				990	/* returns 1 if this lock fills the network structure,
				991	* 0 otherwise */
				992	static int dlm_add_lock_to_array(struct dlm_lock *lock,
				993	struct dlm_migratable_lockres *mres, int queue)
				994	{
				995	struct dlm_migratable_lock *ml;
				996	int lock_num = mres->num_locks;
				997
				998	ml = &(mres->ml[lock_num]);
				999	ml->cookie = lock->ml.cookie;
				1000	ml->type = lock->ml.type;
				1001	ml->convert_type = lock->ml.convert_type;
				1002	ml->highest_blocked = lock->ml.highest_blocked;
				1003	ml->list = queue;
				1004	if (lock->lksb) {
				1005	ml->flags = lock->lksb->flags;
				1006	/* send our current lvb */
				1007	if (ml->type == LKM_EXMODE \|\|
				1008	ml->type == LKM_PRMODE) {
				1009	/* if it is already set, this had better be a PR
				1010	* and it has to match */
				1011	if (mres->lvb[0] && (ml->type == LKM_EXMODE \|\|
				1012	memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
				1013	mlog(ML_ERROR, "mismatched lvbs!\n");
				1014	__dlm_print_one_lock_resource(lock->lockres);
				1015	BUG();
				1016	}
				1017	memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
				1018	}
				1019	}
				1020	ml->node = lock->ml.node;
				1021	mres->num_locks++;
				1022	/* we reached the max, send this network message */
				1023	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
				1024	return 1;
				1025	return 0;
				1026	}
				1027
				1028
				1029	int dlm_send_one_lockres(struct dlm_ctxt dlm, struct dlm_lock_resource res,
				1030	struct dlm_migratable_lockres *mres,
				1031	u8 send_to, u8 flags)
				1032	{
				1033	struct list_head queue, iter;
				1034	int total_locks, i;
				1035	u64 mig_cookie = 0;
				1036	struct dlm_lock *lock;
				1037	int ret = 0;
				1038
				1039	BUG_ON(!(flags & (DLM_MRES_RECOVERY\|DLM_MRES_MIGRATION)));
				1040
				1041	mlog(0, "sending to %u\n", send_to);
				1042
				1043	total_locks = dlm_num_locks_in_lockres(res);
				1044	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
				1045	/* rare, but possible */
				1046	mlog(0, "argh. lockres has %d locks. this will "
				1047	"require more than one network packet to "
				1048	"migrate\n", total_locks);
				1049	mig_cookie = dlm_get_next_mig_cookie();
				1050	}
				1051
				1052	dlm_init_migratable_lockres(mres, res->lockname.name,
				1053	res->lockname.len, total_locks,
				1054	mig_cookie, flags, res->owner);
				1055
				1056	total_locks = 0;
				1057	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
				1058	queue = dlm_list_idx_to_ptr(res, i);
				1059	list_for_each(iter, queue) {
				1060	lock = list_entry (iter, struct dlm_lock, list);
				1061
				1062	/* add another lock. */
				1063	total_locks++;
				1064	if (!dlm_add_lock_to_array(lock, mres, i))
				1065	continue;
				1066
				1067	/* this filled the lock message,
				1068	* we must send it immediately. */
				1069	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
				1070	res, total_locks);
				1071	if (ret < 0) {
				1072	// TODO
				1073	mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
				1074	"returned %d, TODO\n", ret);
				1075	BUG();
				1076	}
				1077	}
				1078	}
				1079	/* flush any remaining locks */
				1080	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
				1081	if (ret < 0) {
				1082	// TODO
				1083	mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
				1084	"TODO\n", ret);
				1085	BUG();
				1086	}
				1087	return ret;
				1088	}
				1089
				1090
				1091
				1092	/*
				1093	* this message will contain no more than one page worth of
				1094	* recovery data, and it will work on only one lockres.
				1095	* there may be many locks in this page, and we may need to wait
				1096	* for additional packets to complete all the locks (rare, but
				1097	* possible).
				1098	*/
				1099	/*
				1100	* NOTE: the allocation error cases here are scary
				1101	* we really cannot afford to fail an alloc in recovery
				1102	* do we spin? returning an error only delays the problem really
				1103	*/
				1104
				1105	int dlm_mig_lockres_handler(struct o2net_msg msg, u32 len, void data)
				1106	{
				1107	struct dlm_ctxt *dlm = data;
				1108	struct dlm_migratable_lockres *mres =
				1109	(struct dlm_migratable_lockres *)msg->buf;
				1110	int ret = 0;
				1111	u8 real_master;
				1112	char *buf = NULL;
				1113	struct dlm_work_item *item = NULL;
				1114	struct dlm_lock_resource *res = NULL;
				1115
				1116	if (!dlm_grab(dlm))
				1117	return -EINVAL;
				1118
				1119	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY\|DLM_MRES_MIGRATION)));
				1120
				1121	real_master = mres->master;
				1122	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
				1123	/* cannot migrate a lockres with no master */
				1124	BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
				1125	}
				1126
				1127	mlog(0, "%s message received from node %u\n",
				1128	(mres->flags & DLM_MRES_RECOVERY) ?
				1129	"recovery" : "migration", mres->master);
				1130	if (mres->flags & DLM_MRES_ALL_DONE)
				1131	mlog(0, "all done flag. all lockres data received!\n");
				1132
				1133	ret = -ENOMEM;
				1134	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
				1135	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
				1136	if (!buf \|\| !item)
				1137	goto leave;
				1138
				1139	/* lookup the lock to see if we have a secondary queue for this
				1140	* already... just add the locks in and this will have its owner
				1141	* and RECOVERY flag changed when it completes. */
				1142	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
				1143	if (res) {
				1144	/* this will get a ref on res */
				1145	/* mark it as recovering/migrating and hash it */
				1146	spin_lock(&res->spinlock);
				1147	if (mres->flags & DLM_MRES_RECOVERY) {
				1148	res->state \|= DLM_LOCK_RES_RECOVERING;
				1149	} else {
				1150	if (res->state & DLM_LOCK_RES_MIGRATING) {
				1151	/* this is at least the second
				1152	* lockres message */
				1153	mlog(0, "lock %.*s is already migrating\n",
				1154	mres->lockname_len,
				1155	mres->lockname);
				1156	} else if (res->state & DLM_LOCK_RES_RECOVERING) {
				1157	/* caller should BUG */
				1158	mlog(ML_ERROR, "node is attempting to migrate "
				1159	"lock %.*s, but marked as recovering!\n",
				1160	mres->lockname_len, mres->lockname);
				1161	ret = -EFAULT;
				1162	spin_unlock(&res->spinlock);
				1163	goto leave;
				1164	}
				1165	res->state \|= DLM_LOCK_RES_MIGRATING;
				1166	}
				1167	spin_unlock(&res->spinlock);
				1168	} else {
				1169	/* need to allocate, just like if it was
				1170	* mastered here normally */
				1171	res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
				1172	if (!res)
				1173	goto leave;
				1174
				1175	/* to match the ref that we would have gotten if
				1176	* dlm_lookup_lockres had succeeded */
				1177	dlm_lockres_get(res);
				1178
				1179	/* mark it as recovering/migrating and hash it */
				1180	if (mres->flags & DLM_MRES_RECOVERY)
				1181	res->state \|= DLM_LOCK_RES_RECOVERING;
				1182	else
				1183	res->state \|= DLM_LOCK_RES_MIGRATING;
				1184
				1185	spin_lock(&dlm->spinlock);
				1186	__dlm_insert_lockres(dlm, res);
				1187	spin_unlock(&dlm->spinlock);
				1188
				1189	/* now that the new lockres is inserted,
				1190	* make it usable by other processes */
				1191	spin_lock(&res->spinlock);
				1192	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
				1193	spin_unlock(&res->spinlock);
				1194
				1195	/* add an extra ref for just-allocated lockres
				1196	* otherwise the lockres will be purged immediately */
				1197	dlm_lockres_get(res);
				1198
				1199	}
				1200
				1201	/* at this point we have allocated everything we need,
				1202	* and we have a hashed lockres with an extra ref and
				1203	* the proper res->state flags. */
				1204	ret = 0;
				1205	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
				1206	/* migration cannot have an unknown master */
				1207	BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
				1208	mlog(0, "recovery has passed me a lockres with an "
				1209	"unknown owner.. will need to requery: "
				1210	"%.*s\n", mres->lockname_len, mres->lockname);
				1211	} else {
				1212	spin_lock(&res->spinlock);
				1213	dlm_change_lockres_owner(dlm, res, dlm->node_num);
				1214	spin_unlock(&res->spinlock);
				1215	}
				1216
				1217	/* queue up work for dlm_mig_lockres_worker */
				1218	dlm_grab(dlm); /* get an extra ref for the work item */
				1219	memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */
				1220	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
				1221	item->u.ml.lockres = res; /* already have a ref */
				1222	item->u.ml.real_master = real_master;
				1223	spin_lock(&dlm->work_lock);
				1224	list_add_tail(&item->list, &dlm->work_list);
				1225	spin_unlock(&dlm->work_lock);
				1226	schedule_work(&dlm->dispatched_work);
				1227
				1228	leave:
				1229	dlm_put(dlm);
				1230	if (ret < 0) {
				1231	if (buf)
				1232	kfree(buf);
				1233	if (item)
				1234	kfree(item);
				1235	}
				1236
				1237	mlog_exit(ret);
				1238	return ret;
				1239	}
				1240
				1241
				1242	static void dlm_mig_lockres_worker(struct dlm_work_item item, void data)
				1243	{
				1244	struct dlm_ctxt *dlm = data;
				1245	struct dlm_migratable_lockres *mres;
				1246	int ret = 0;
				1247	struct dlm_lock_resource *res;
				1248	u8 real_master;
				1249
				1250	dlm = item->dlm;
				1251	mres = (struct dlm_migratable_lockres *)data;
				1252
				1253	res = item->u.ml.lockres;
				1254	real_master = item->u.ml.real_master;
				1255
				1256	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
				1257	/* this case is super-rare. only occurs if
				1258	* node death happens during migration. */
				1259	again:
				1260	ret = dlm_lockres_master_requery(dlm, res, &real_master);
				1261	if (ret < 0) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	1262	mlog(0, "dlm_lockres_master_requery ret=%d\n",
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1263	ret);
				1264	goto again;
				1265	}
				1266	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
				1267	mlog(0, "lockres %.*s not claimed. "
				1268	"this node will take it.\n",
				1269	res->lockname.len, res->lockname.name);
				1270	} else {
				1271	mlog(0, "master needs to respond to sender "
				1272	"that node %u still owns %.*s\n",
				1273	real_master, res->lockname.len,
				1274	res->lockname.name);
				1275	/* cannot touch this lockres */
				1276	goto leave;
				1277	}
				1278	}
				1279
				1280	ret = dlm_process_recovery_data(dlm, res, mres);
				1281	if (ret < 0)
				1282	mlog(0, "dlm_process_recovery_data returned %d\n", ret);
				1283	else
				1284	mlog(0, "dlm_process_recovery_data succeeded\n");
				1285
				1286	if ((mres->flags & (DLM_MRES_MIGRATION\|DLM_MRES_ALL_DONE)) ==
				1287	(DLM_MRES_MIGRATION\|DLM_MRES_ALL_DONE)) {
				1288	ret = dlm_finish_migration(dlm, res, mres->master);
				1289	if (ret < 0)
				1290	mlog_errno(ret);
				1291	}
				1292
				1293	leave:
				1294	kfree(data);
				1295	mlog_exit(ret);
				1296	}
				1297
				1298
				1299
				1300	static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
				1301	struct dlm_lock_resource *res,
				1302	u8 *real_master)
				1303	{
				1304	struct dlm_node_iter iter;
				1305	int nodenum;
				1306	int ret = 0;
				1307
				1308	*real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
				1309
				1310	/* we only reach here if one of the two nodes in a
				1311	* migration died while the migration was in progress.
				1312	* at this point we need to requery the master. we
				1313	* know that the new_master got as far as creating
				1314	* an mle on at least one node, but we do not know
				1315	* if any nodes had actually cleared the mle and set
				1316	* the master to the new_master. the old master
				1317	* is supposed to set the owner to UNKNOWN in the
				1318	* event of a new_master death, so the only possible
				1319	* responses that we can get from nodes here are
				1320	* that the master is new_master, or that the master
				1321	* is UNKNOWN.
				1322	* if all nodes come back with UNKNOWN then we know
				1323	* the lock needs remastering here.
				1324	* if any node comes back with a valid master, check
				1325	* to see if that master is the one that we are
				1326	* recovering. if so, then the new_master died and
				1327	* we need to remaster this lock. if not, then the
				1328	* new_master survived and that node will respond to
				1329	* other nodes about the owner.
				1330	* if there is an owner, this node needs to dump this
				1331	* lockres and alert the sender that this lockres
				1332	* was rejected. */
				1333	spin_lock(&dlm->spinlock);
				1334	dlm_node_iter_init(dlm->domain_map, &iter);
				1335	spin_unlock(&dlm->spinlock);
				1336
				1337	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
				1338	/* do not send to self */
				1339	if (nodenum == dlm->node_num)
				1340	continue;
				1341	ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
				1342	if (ret < 0) {
				1343	mlog_errno(ret);
				1344	BUG();
				1345	/* TODO: need to figure a way to restart this */
				1346	}
				1347	if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
				1348	mlog(0, "lock master is %u\n", *real_master);
				1349	break;
				1350	}
				1351	}
				1352	return ret;
				1353	}
				1354
				1355
				1356	static int dlm_do_master_requery(struct dlm_ctxt *dlm,
				1357	struct dlm_lock_resource *res,
				1358	u8 nodenum, u8 *real_master)
				1359	{
				1360	int ret = -EINVAL;
				1361	struct dlm_master_requery req;
				1362	int status = DLM_LOCK_RES_OWNER_UNKNOWN;
				1363
				1364	memset(&req, 0, sizeof(req));
				1365	req.node_idx = dlm->node_num;
				1366	req.namelen = res->lockname.len;
				1367	memcpy(req.name, res->lockname.name, res->lockname.len);
				1368
				1369	ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
				1370	&req, sizeof(req), nodenum, &status);
				1371	/* XXX: negative status not handled properly here. */
				1372	if (ret < 0)
				1373	mlog_errno(ret);
				1374	else {
				1375	BUG_ON(status < 0);
				1376	BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
				1377	*real_master = (u8) (status & 0xff);
				1378	mlog(0, "node %u responded to master requery with %u\n",
				1379	nodenum, *real_master);
				1380	ret = 0;
				1381	}
				1382	return ret;
				1383	}
				1384
				1385
				1386	/* this function cannot error, so unless the sending
				1387	* or receiving of the message failed, the owner can
				1388	* be trusted */
				1389	int dlm_master_requery_handler(struct o2net_msg msg, u32 len, void data)
				1390	{
				1391	struct dlm_ctxt *dlm = data;
				1392	struct dlm_master_requery req = (struct dlm_master_requery )msg->buf;
				1393	struct dlm_lock_resource *res = NULL;
				1394	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
				1395	u32 flags = DLM_ASSERT_MASTER_REQUERY;
				1396
				1397	if (!dlm_grab(dlm)) {
				1398	/* since the domain has gone away on this
				1399	* node, the proper response is UNKNOWN */
				1400	return master;
				1401	}
				1402
				1403	spin_lock(&dlm->spinlock);
				1404	res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
				1405	if (res) {
				1406	spin_lock(&res->spinlock);
				1407	master = res->owner;
				1408	if (master == dlm->node_num) {
				1409	int ret = dlm_dispatch_assert_master(dlm, res,
				1410	0, 0, flags);
				1411	if (ret < 0) {
				1412	mlog_errno(-ENOMEM);
				1413	/* retry!? */
				1414	BUG();
				1415	}
				1416	}
				1417	spin_unlock(&res->spinlock);
				1418	}
				1419	spin_unlock(&dlm->spinlock);
				1420
				1421	dlm_put(dlm);
				1422	return master;
				1423	}
				1424
				1425	static inline struct list_head *
				1426	dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
				1427	{
				1428	struct list_head *ret;
				1429	BUG_ON(list_num < 0);
				1430	BUG_ON(list_num > 2);
				1431	ret = &(res->granted);
				1432	ret += list_num;
				1433	return ret;
				1434	}
				1435	/* TODO: do ast flush business
				1436	* TODO: do MIGRATING and RECOVERING spinning
				1437	*/
				1438
				1439	/*
				1440	* NOTE about in-flight requests during migration:
				1441	*
				1442	* Before attempting the migrate, the master has marked the lockres as
				1443	* MIGRATING and then flushed all of its pending ASTS. So any in-flight
				1444	* requests either got queued before the MIGRATING flag got set, in which
				1445	* case the lock data will reflect the change and a return message is on
				1446	* the way, or the request failed to get in before MIGRATING got set. In
				1447	* this case, the caller will be told to spin and wait for the MIGRATING
				1448	* flag to be dropped, then recheck the master.
				1449	* This holds true for the convert, cancel and unlock cases, and since lvb
				1450	* updates are tied to these same messages, it applies to lvb updates as
				1451	* well. For the lock case, there is no way a lock can be on the master
				1452	* queue and not be on the secondary queue since the lock is always added
				1453	* locally first. This means that the new target node will never be sent
				1454	* a lock that he doesn't already have on the list.
				1455	* In total, this means that the local lock is correct and should not be
				1456	* updated to match the one sent by the master. Any messages sent back
				1457	* from the master before the MIGRATING flag will bring the lock properly
				1458	* up-to-date, and the change will be ordered properly for the waiter.
				1459	* We will not attempt to modify the lock underneath the waiter.
				1460	*/
				1461
				1462	static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
				1463	struct dlm_lock_resource *res,
				1464	struct dlm_migratable_lockres *mres)
				1465	{
				1466	struct dlm_migratable_lock *ml;
				1467	struct list_head *queue;
				1468	struct dlm_lock *newlock = NULL;
				1469	struct dlm_lockstatus *lksb = NULL;
				1470	int ret = 0;
				1471	int i;
				1472	struct list_head *iter;
				1473	struct dlm_lock *lock = NULL;
				1474
				1475	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
				1476	for (i=0; i<mres->num_locks; i++) {
				1477	ml = &(mres->ml[i]);
				1478	BUG_ON(ml->highest_blocked != LKM_IVMODE);
				1479	newlock = NULL;
				1480	lksb = NULL;
				1481
				1482	queue = dlm_list_num_to_pointer(res, ml->list);
				1483
				1484	/* if the lock is for the local node it needs to
				1485	* be moved to the proper location within the queue.
				1486	* do not allocate a new lock structure. */
				1487	if (ml->node == dlm->node_num) {
				1488	/* MIGRATION ONLY! */
				1489	BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
				1490
				1491	spin_lock(&res->spinlock);
				1492	list_for_each(iter, queue) {
				1493	lock = list_entry (iter, struct dlm_lock, list);
				1494	if (lock->ml.cookie != ml->cookie)
				1495	lock = NULL;
				1496	else
				1497	break;
				1498	}
				1499
				1500	/* lock is always created locally first, and
				1501	* destroyed locally last. it must be on the list */
				1502	if (!lock) {
				1503	mlog(ML_ERROR, "could not find local lock "
				1504	"with cookie %"MLFu64"!\n",
				1505	ml->cookie);
				1506	BUG();
				1507	}
				1508	BUG_ON(lock->ml.node != ml->node);
				1509
				1510	/* see NOTE above about why we do not update
				1511	* to match the master here */
				1512
				1513	/* move the lock to its proper place */
				1514	/* do not alter lock refcount. switching lists. */
				1515	list_del_init(&lock->list);
				1516	list_add_tail(&lock->list, queue);
				1517	spin_unlock(&res->spinlock);
				1518
				1519	mlog(0, "just reordered a local lock!\n");
				1520	continue;
				1521	}
				1522
				1523	/* lock is for another node. */
				1524	newlock = dlm_new_lock(ml->type, ml->node,
				1525	be64_to_cpu(ml->cookie), NULL);
				1526	if (!newlock) {
				1527	ret = -ENOMEM;
				1528	goto leave;
				1529	}
				1530	lksb = newlock->lksb;
				1531	dlm_lock_attach_lockres(newlock, res);
				1532
				1533	if (ml->convert_type != LKM_IVMODE) {
				1534	BUG_ON(queue != &res->converting);
				1535	newlock->ml.convert_type = ml->convert_type;
				1536	}
				1537	lksb->flags \|= (ml->flags &
				1538	(DLM_LKSB_PUT_LVB\|DLM_LKSB_GET_LVB));
				1539
				1540	if (mres->lvb[0]) {
				1541	if (lksb->flags & DLM_LKSB_PUT_LVB) {
				1542	/* other node was trying to update
				1543	* lvb when node died. recreate the
				1544	* lksb with the updated lvb. */
				1545	memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
				1546	} else {
				1547	/* otherwise, the node is sending its
				1548	* most recent valid lvb info */
				1549	BUG_ON(ml->type != LKM_EXMODE &&
				1550	ml->type != LKM_PRMODE);
				1551	if (res->lvb[0] && (ml->type == LKM_EXMODE \|\|
				1552	memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
				1553	mlog(ML_ERROR, "received bad lvb!\n");
				1554	__dlm_print_one_lock_resource(res);
				1555	BUG();
				1556	}
				1557	memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
				1558	}
				1559	}
				1560
				1561
				1562	/* NOTE:
				1563	* wrt lock queue ordering and recovery:
				1564	* 1. order of locks on granted queue is
				1565	* meaningless.
				1566	* 2. order of locks on converting queue is
				1567	* LOST with the node death. sorry charlie.
				1568	* 3. order of locks on the blocked queue is
				1569	* also LOST.
				1570	* order of locks does not affect integrity, it
				1571	* just means that a lock request may get pushed
				1572	* back in line as a result of the node death.
				1573	* also note that for a given node the lock order
				1574	* for its secondary queue locks is preserved
				1575	* relative to each other, but clearly not
				1576	* preserved relative to locks from other nodes.
				1577	*/
				1578	spin_lock(&res->spinlock);
				1579	dlm_lock_get(newlock);
				1580	list_add_tail(&newlock->list, queue);
				1581	spin_unlock(&res->spinlock);
				1582	}
				1583	mlog(0, "done running all the locks\n");
				1584
				1585	leave:
				1586	if (ret < 0) {
				1587	mlog_errno(ret);
				1588	if (newlock)
				1589	dlm_lock_put(newlock);
				1590	}
				1591
				1592	mlog_exit(ret);
				1593	return ret;
				1594	}
				1595
				1596	void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
				1597	struct dlm_lock_resource *res)
				1598	{
				1599	int i;
				1600	struct list_head queue, iter, *iter2;
				1601	struct dlm_lock *lock;
				1602
				1603	res->state \|= DLM_LOCK_RES_RECOVERING;
				1604	if (!list_empty(&res->recovering))
				1605	list_del_init(&res->recovering);
				1606	list_add_tail(&res->recovering, &dlm->reco.resources);
				1607
				1608	/* find any pending locks and put them back on proper list */
				1609	for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
				1610	queue = dlm_list_idx_to_ptr(res, i);
				1611	list_for_each_safe(iter, iter2, queue) {
				1612	lock = list_entry (iter, struct dlm_lock, list);
				1613	dlm_lock_get(lock);
				1614	if (lock->convert_pending) {
				1615	/* move converting lock back to granted */
				1616	BUG_ON(i != DLM_CONVERTING_LIST);
				1617	mlog(0, "node died with convert pending "
				1618	"on %.*s. move back to granted list.\n",
				1619	res->lockname.len, res->lockname.name);
				1620	dlm_revert_pending_convert(res, lock);
				1621	lock->convert_pending = 0;
				1622	} else if (lock->lock_pending) {
				1623	/* remove pending lock requests completely */
				1624	BUG_ON(i != DLM_BLOCKED_LIST);
				1625	mlog(0, "node died with lock pending "
				1626	"on %.*s. remove from blocked list and skip.\n",
				1627	res->lockname.len, res->lockname.name);
				1628	/* lock will be floating until ref in
				1629	* dlmlock_remote is freed after the network
				1630	* call returns. ok for it to not be on any
				1631	* list since no ast can be called
				1632	* (the master is dead). */
				1633	dlm_revert_pending_lock(res, lock);
				1634	lock->lock_pending = 0;
				1635	} else if (lock->unlock_pending) {
				1636	/* if an unlock was in progress, treat as
				1637	* if this had completed successfully
				1638	* before sending this lock state to the
				1639	* new master. note that the dlm_unlock
				1640	* call is still responsible for calling
				1641	* the unlockast. that will happen after
				1642	* the network call times out. for now,
				1643	* just move lists to prepare the new
				1644	* recovery master. */
				1645	BUG_ON(i != DLM_GRANTED_LIST);
				1646	mlog(0, "node died with unlock pending "
				1647	"on %.*s. remove from blocked list and skip.\n",
				1648	res->lockname.len, res->lockname.name);
				1649	dlm_commit_pending_unlock(res, lock);
				1650	lock->unlock_pending = 0;
				1651	} else if (lock->cancel_pending) {
				1652	/* if a cancel was in progress, treat as
				1653	* if this had completed successfully
				1654	* before sending this lock state to the
				1655	* new master */
				1656	BUG_ON(i != DLM_CONVERTING_LIST);
				1657	mlog(0, "node died with cancel pending "
				1658	"on %.*s. move back to granted list.\n",
				1659	res->lockname.len, res->lockname.name);
				1660	dlm_commit_pending_cancel(res, lock);
				1661	lock->cancel_pending = 0;
				1662	}
				1663	dlm_lock_put(lock);
				1664	}
				1665	}
				1666	}
				1667
				1668
				1669
				1670	/* removes all recovered locks from the recovery list.
				1671	* sets the res->owner to the new master.
				1672	* unsets the RECOVERY flag and wakes waiters. */
				1673	static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
				1674	u8 dead_node, u8 new_master)
				1675	{
				1676	int i;
				1677	struct list_head iter, iter2, *bucket;
				1678	struct dlm_lock_resource *res;
				1679
				1680	mlog_entry_void();
				1681
				1682	assert_spin_locked(&dlm->spinlock);
				1683
				1684	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
				1685	res = list_entry (iter, struct dlm_lock_resource, recovering);
				1686	if (res->owner == dead_node) {
				1687	list_del_init(&res->recovering);
				1688	spin_lock(&res->spinlock);
				1689	dlm_change_lockres_owner(dlm, res, new_master);
				1690	res->state &= ~DLM_LOCK_RES_RECOVERING;
				1691	__dlm_dirty_lockres(dlm, res);
				1692	spin_unlock(&res->spinlock);
				1693	wake_up(&res->wq);
				1694	}
				1695	}
				1696
				1697	/* this will become unnecessary eventually, but
				1698	* for now we need to run the whole hash, clear
				1699	* the RECOVERING state and set the owner
				1700	* if necessary */
				1701	for (i=0; i<DLM_HASH_SIZE; i++) {
				1702	bucket = &(dlm->resources[i]);
				1703	list_for_each(iter, bucket) {
				1704	res = list_entry (iter, struct dlm_lock_resource, list);
				1705	if (res->state & DLM_LOCK_RES_RECOVERING) {
				1706	if (res->owner == dead_node) {
				1707	mlog(0, "(this=%u) res %.*s owner=%u "
				1708	"was not on recovering list, but "
				1709	"clearing state anyway\n",
				1710	dlm->node_num, res->lockname.len,
				1711	res->lockname.name, new_master);
				1712	} else if (res->owner == dlm->node_num) {
				1713	mlog(0, "(this=%u) res %.*s owner=%u "
				1714	"was not on recovering list, "
				1715	"owner is THIS node, clearing\n",
				1716	dlm->node_num, res->lockname.len,
				1717	res->lockname.name, new_master);
				1718	} else
				1719	continue;
				1720
				1721	spin_lock(&res->spinlock);
				1722	dlm_change_lockres_owner(dlm, res, new_master);
				1723	res->state &= ~DLM_LOCK_RES_RECOVERING;
				1724	__dlm_dirty_lockres(dlm, res);
				1725	spin_unlock(&res->spinlock);
				1726	wake_up(&res->wq);
				1727	}
				1728	}
				1729	}
				1730	}
				1731
				1732	static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
				1733	{
				1734	if (local) {
				1735	if (lock->ml.type != LKM_EXMODE &&
				1736	lock->ml.type != LKM_PRMODE)
				1737	return 1;
				1738	} else if (lock->ml.type == LKM_EXMODE)
				1739	return 1;
				1740	return 0;
				1741	}
				1742
				1743	static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
				1744	struct dlm_lock_resource *res, u8 dead_node)
				1745	{
				1746	struct list_head iter, queue;
				1747	struct dlm_lock *lock;
				1748	int blank_lvb = 0, local = 0;
				1749	int i;
				1750	u8 search_node;
				1751
				1752	assert_spin_locked(&dlm->spinlock);
				1753	assert_spin_locked(&res->spinlock);
				1754
				1755	if (res->owner == dlm->node_num)
				1756	/* if this node owned the lockres, and if the dead node
				1757	* had an EX when he died, blank out the lvb */
				1758	search_node = dead_node;
				1759	else {
				1760	/* if this is a secondary lockres, and we had no EX or PR
				1761	* locks granted, we can no longer trust the lvb */
				1762	search_node = dlm->node_num;
				1763	local = 1; /* check local state for valid lvb */
				1764	}
				1765
				1766	for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
				1767	queue = dlm_list_idx_to_ptr(res, i);
				1768	list_for_each(iter, queue) {
				1769	lock = list_entry (iter, struct dlm_lock, list);
				1770	if (lock->ml.node == search_node) {
				1771	if (dlm_lvb_needs_invalidation(lock, local)) {
				1772	/* zero the lksb lvb and lockres lvb */
				1773	blank_lvb = 1;
				1774	memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
				1775	}
				1776	}
				1777	}
				1778	}
				1779
				1780	if (blank_lvb) {
				1781	mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
				1782	res->lockname.len, res->lockname.name, dead_node);
				1783	memset(res->lvb, 0, DLM_LVB_LEN);
				1784	}
				1785	}
				1786
				1787	static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
				1788	struct dlm_lock_resource *res, u8 dead_node)
				1789	{
				1790	struct list_head iter, tmpiter;
				1791	struct dlm_lock *lock;
				1792
				1793	/* this node is the lockres master:
				1794	* 1) remove any stale locks for the dead node
				1795	* 2) if the dead node had an EX when he died, blank out the lvb
				1796	*/
				1797	assert_spin_locked(&dlm->spinlock);
				1798	assert_spin_locked(&res->spinlock);
				1799
				1800	/* TODO: check pending_asts, pending_basts here */
				1801	list_for_each_safe(iter, tmpiter, &res->granted) {
				1802	lock = list_entry (iter, struct dlm_lock, list);
				1803	if (lock->ml.node == dead_node) {
				1804	list_del_init(&lock->list);
				1805	dlm_lock_put(lock);
				1806	}
				1807	}
				1808	list_for_each_safe(iter, tmpiter, &res->converting) {
				1809	lock = list_entry (iter, struct dlm_lock, list);
				1810	if (lock->ml.node == dead_node) {
				1811	list_del_init(&lock->list);
				1812	dlm_lock_put(lock);
				1813	}
				1814	}
				1815	list_for_each_safe(iter, tmpiter, &res->blocked) {
				1816	lock = list_entry (iter, struct dlm_lock, list);
				1817	if (lock->ml.node == dead_node) {
				1818	list_del_init(&lock->list);
				1819	dlm_lock_put(lock);
				1820	}
				1821	}
				1822
				1823	/* do not kick thread yet */
				1824	__dlm_dirty_lockres(dlm, res);
				1825	}
				1826
				1827	/* if this node is the recovery master, and there are no
				1828	* locks for a given lockres owned by this node that are in
				1829	* either PR or EX mode, zero out the lvb before requesting.
				1830	*
				1831	*/
				1832
				1833
				1834	static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
				1835	{
				1836	struct list_head *iter;
				1837	struct dlm_lock_resource *res;
				1838	int i;
				1839	struct list_head *bucket;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	1840	struct dlm_lock *lock;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1841
				1842
				1843	/* purge any stale mles */
				1844	dlm_clean_master_list(dlm, dead_node);
				1845
				1846	/*
				1847	* now clean up all lock resources. there are two rules:
				1848	*
				1849	* 1) if the dead node was the master, move the lockres
				1850	* to the recovering list. set the RECOVERING flag.
				1851	* this lockres needs to be cleaned up before it can
				1852	* be used further.
				1853	*
				1854	* 2) if this node was the master, remove all locks from
				1855	* each of the lockres queues that were owned by the
				1856	* dead node. once recovery finishes, the dlm thread
				1857	* can be kicked again to see if any ASTs or BASTs
				1858	* need to be fired as a result.
				1859	*/
				1860	for (i=0; i<DLM_HASH_SIZE; i++) {
				1861	bucket = &(dlm->resources[i]);
				1862	list_for_each(iter, bucket) {
				1863	res = list_entry (iter, struct dlm_lock_resource, list);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	1864	/* always prune any $RECOVERY entries for dead nodes,
				1865	* otherwise hangs can occur during later recovery */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1866	if (dlm_is_recovery_lock(res->lockname.name,
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	1867	res->lockname.len)) {
				1868	spin_lock(&res->spinlock);
				1869	list_for_each_entry(lock, &res->granted, list) {
				1870	if (lock->ml.node == dead_node) {
				1871	mlog(0, "AHA! there was "
				1872	"a $RECOVERY lock for dead "
				1873	"node %u (%s)!\n",
				1874	dead_node, dlm->name);
				1875	list_del_init(&lock->list);
				1876	dlm_lock_put(lock);
				1877	break;
				1878	}
				1879	}
				1880	spin_unlock(&res->spinlock);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1881	continue;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	1882	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1883	spin_lock(&res->spinlock);
				1884	/* zero the lvb if necessary */
				1885	dlm_revalidate_lvb(dlm, res, dead_node);
				1886	if (res->owner == dead_node)
				1887	dlm_move_lockres_to_recovery_list(dlm, res);
				1888	else if (res->owner == dlm->node_num) {
				1889	dlm_free_dead_locks(dlm, res, dead_node);
				1890	__dlm_lockres_calc_usage(dlm, res);
				1891	}
				1892	spin_unlock(&res->spinlock);
				1893	}
				1894	}
				1895
				1896	}
				1897
				1898	static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
				1899	{
				1900	assert_spin_locked(&dlm->spinlock);
				1901
				1902	/* check to see if the node is already considered dead */
				1903	if (!test_bit(idx, dlm->live_nodes_map)) {
				1904	mlog(0, "for domain %s, node %d is already dead. "
				1905	"another node likely did recovery already.\n",
				1906	dlm->name, idx);
				1907	return;
				1908	}
				1909
				1910	/* check to see if we do not care about this node */
				1911	if (!test_bit(idx, dlm->domain_map)) {
				1912	/* This also catches the case that we get a node down
				1913	* but haven't joined the domain yet. */
				1914	mlog(0, "node %u already removed from domain!\n", idx);
				1915	return;
				1916	}
				1917
				1918	clear_bit(idx, dlm->live_nodes_map);
				1919
				1920	/* Clean up join state on node death. */
				1921	if (dlm->joining_node == idx) {
				1922	mlog(0, "Clearing join state for node %u\n", idx);
				1923	__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
				1924	}
				1925
				1926	/* make sure local cleanup occurs before the heartbeat events */
				1927	if (!test_bit(idx, dlm->recovery_map))
				1928	dlm_do_local_recovery_cleanup(dlm, idx);
				1929
				1930	/* notify anything attached to the heartbeat events */
				1931	dlm_hb_event_notify_attached(dlm, idx, 0);
				1932
				1933	mlog(0, "node %u being removed from domain map!\n", idx);
				1934	clear_bit(idx, dlm->domain_map);
				1935	/* wake up migration waiters if a node goes down.
				1936	* perhaps later we can genericize this for other waiters. */
				1937	wake_up(&dlm->migration_wq);
				1938
				1939	if (test_bit(idx, dlm->recovery_map))
				1940	mlog(0, "domain %s, node %u already added "
				1941	"to recovery map!\n", dlm->name, idx);
				1942	else
				1943	set_bit(idx, dlm->recovery_map);
				1944	}
				1945
				1946	void dlm_hb_node_down_cb(struct o2nm_node node, int idx, void data)
				1947	{
				1948	struct dlm_ctxt *dlm = data;
				1949
				1950	if (!dlm_grab(dlm))
				1951	return;
				1952
				1953	spin_lock(&dlm->spinlock);
				1954	__dlm_hb_node_down(dlm, idx);
				1955	spin_unlock(&dlm->spinlock);
				1956
				1957	dlm_put(dlm);
				1958	}
				1959
				1960	void dlm_hb_node_up_cb(struct o2nm_node node, int idx, void data)
				1961	{
				1962	struct dlm_ctxt *dlm = data;
				1963
				1964	if (!dlm_grab(dlm))
				1965	return;
				1966
				1967	spin_lock(&dlm->spinlock);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1968	set_bit(idx, dlm->live_nodes_map);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	1969	/* do NOT notify mle attached to the heartbeat events.
				1970	* new nodes are not interesting in mastery until joined. */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1971	spin_unlock(&dlm->spinlock);
				1972
				1973	dlm_put(dlm);
				1974	}
				1975
				1976	static void dlm_reco_ast(void *astdata)
				1977	{
				1978	struct dlm_ctxt *dlm = astdata;
				1979	mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n",
				1980	dlm->node_num, dlm->name);
				1981	}
				1982	static void dlm_reco_bast(void *astdata, int blocked_type)
				1983	{
				1984	struct dlm_ctxt *dlm = astdata;
				1985	mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n",
				1986	dlm->node_num, dlm->name);
				1987	}
				1988	static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
				1989	{
				1990	mlog(0, "unlockast for recovery lock fired!\n");
				1991	}
				1992
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	1993	/*
				1994	* dlm_pick_recovery_master will continually attempt to use
				1995	* dlmlock() on the special "$RECOVERY" lockres with the
				1996	* LKM_NOQUEUE flag to get an EX. every thread that enters
				1997	* this function on each node racing to become the recovery
				1998	* master will not stop attempting this until either:
				1999	* a) this node gets the EX (and becomes the recovery master),
				2000	* or b) dlm->reco.new_master gets set to some nodenum
				2001	* != O2NM_INVALID_NODE_NUM (another node will do the reco).
				2002	* so each time a recovery master is needed, the entire cluster
				2003	* will sync at this point. if the new master dies, that will
				2004	* be detected in dlm_do_recovery */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2005	static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
				2006	{
				2007	enum dlm_status ret;
				2008	struct dlm_lockstatus lksb;
				2009	int status = -EINVAL;
				2010
				2011	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
				2012	dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2013	again:
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2014	memset(&lksb, 0, sizeof(lksb));
				2015
				2016	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE\|LKM_RECOVERY,
				2017	DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
				2018
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2019	mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
				2020	dlm->name, ret, lksb.status);
				2021
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2022	if (ret == DLM_NORMAL) {
				2023	mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
				2024	dlm->name, dlm->node_num);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2025
				2026	/* got the EX lock. check to see if another node
				2027	* just became the reco master */
				2028	if (dlm_reco_master_ready(dlm)) {
				2029	mlog(0, "%s: got reco EX lock, but %u will "
				2030	"do the recovery\n", dlm->name,
				2031	dlm->reco.new_master);
				2032	status = -EEXIST;
				2033	} else {
				2034	status = dlm_send_begin_reco_message(dlm,
				2035	dlm->reco.dead_node);
				2036	/* this always succeeds */
				2037	BUG_ON(status);
				2038
				2039	/* set the new_master to this node */
				2040	spin_lock(&dlm->spinlock);
				2041	dlm->reco.new_master = dlm->node_num;
				2042	spin_unlock(&dlm->spinlock);
				2043	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2044
				2045	/* recovery lock is a special case. ast will not get fired,
				2046	* so just go ahead and unlock it. */
				2047	ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2048	if (ret == DLM_DENIED) {
				2049	mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
				2050	ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
				2051	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2052	if (ret != DLM_NORMAL) {
				2053	/* this would really suck. this could only happen
				2054	* if there was a network error during the unlock
				2055	* because of node death. this means the unlock
				2056	* is actually "done" and the lock structure is
				2057	* even freed. we can continue, but only
				2058	* because this specific lock name is special. */
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2059	mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2060	}
				2061	} else if (ret == DLM_NOTQUEUED) {
				2062	mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
				2063	dlm->name, dlm->node_num);
				2064	/* another node is master. wait on
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2065	* reco.new_master != O2NM_INVALID_NODE_NUM
				2066	* for at most one second */
				2067	wait_event_timeout(dlm->dlm_reco_thread_wq,
				2068	dlm_reco_master_ready(dlm),
				2069	msecs_to_jiffies(1000));
				2070	if (!dlm_reco_master_ready(dlm)) {
				2071	mlog(0, "%s: reco master taking awhile\n",
				2072	dlm->name);
				2073	goto again;
				2074	}
				2075	/* another node has informed this one that it is reco master */
				2076	mlog(0, "%s: reco master %u is ready to recover %u\n",
				2077	dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2078	status = -EEXIST;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2079	} else {
				2080	struct dlm_lock_resource *res;
				2081
				2082	/* dlmlock returned something other than NOTQUEUED or NORMAL */
				2083	mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
				2084	"lksb.status=%s\n", dlm->name, dlm_errname(ret),
				2085	dlm_errname(lksb.status));
				2086	res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
				2087	DLM_RECOVERY_LOCK_NAME_LEN);
				2088	if (res) {
				2089	dlm_print_one_lock_resource(res);
				2090	dlm_lockres_put(res);
				2091	} else {
				2092	mlog(ML_ERROR, "recovery lock not found\n");
				2093	}
				2094	BUG();
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2095	}
				2096
				2097	return status;
				2098	}
				2099
				2100	static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
				2101	{
				2102	struct dlm_begin_reco br;
				2103	int ret = 0;
				2104	struct dlm_node_iter iter;
				2105	int nodenum;
				2106	int status;
				2107
				2108	mlog_entry("%u\n", dead_node);
				2109
				2110	mlog(0, "dead node is %u\n", dead_node);
				2111
				2112	spin_lock(&dlm->spinlock);
				2113	dlm_node_iter_init(dlm->domain_map, &iter);
				2114	spin_unlock(&dlm->spinlock);
				2115
				2116	clear_bit(dead_node, iter.node_map);
				2117
				2118	memset(&br, 0, sizeof(br));
				2119	br.node_idx = dlm->node_num;
				2120	br.dead_node = dead_node;
				2121
				2122	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
				2123	ret = 0;
				2124	if (nodenum == dead_node) {
				2125	mlog(0, "not sending begin reco to dead node "
				2126	"%u\n", dead_node);
				2127	continue;
				2128	}
				2129	if (nodenum == dlm->node_num) {
				2130	mlog(0, "not sending begin reco to self\n");
				2131	continue;
				2132	}
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2133	retry:
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2134	ret = -EINVAL;
				2135	mlog(0, "attempting to send begin reco msg to %d\n",
				2136	nodenum);
				2137	ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
				2138	&br, sizeof(br), nodenum, &status);
				2139	/* negative status is handled ok by caller here */
				2140	if (ret >= 0)
				2141	ret = status;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2142	if (dlm_is_host_down(ret)) {
				2143	/* node is down. not involved in recovery
				2144	* so just keep going */
				2145	mlog(0, "%s: node %u was down when sending "
				2146	"begin reco msg (%d)\n", dlm->name, nodenum, ret);
				2147	ret = 0;
				2148	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2149	if (ret < 0) {
				2150	struct dlm_lock_resource *res;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2151	/* this is now a serious problem, possibly ENOMEM
				2152	* in the network stack. must retry */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2153	mlog_errno(ret);
				2154	mlog(ML_ERROR, "begin reco of dlm %s to node %u "
				2155	" returned %d\n", dlm->name, nodenum, ret);
				2156	res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
				2157	DLM_RECOVERY_LOCK_NAME_LEN);
				2158	if (res) {
				2159	dlm_print_one_lock_resource(res);
				2160	dlm_lockres_put(res);
				2161	} else {
				2162	mlog(ML_ERROR, "recovery lock not found\n");
				2163	}
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2164	/* sleep for a bit in hopes that we can avoid
				2165	* another ENOMEM */
				2166	msleep(100);
				2167	goto retry;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2168	}
				2169	}
				2170
				2171	return ret;
				2172	}
				2173
				2174	int dlm_begin_reco_handler(struct o2net_msg msg, u32 len, void data)
				2175	{
				2176	struct dlm_ctxt *dlm = data;
				2177	struct dlm_begin_reco br = (struct dlm_begin_reco )msg->buf;
				2178
				2179	/* ok to return 0, domain has gone away */
				2180	if (!dlm_grab(dlm))
				2181	return 0;
				2182
				2183	mlog(0, "node %u wants to recover node %u\n",
				2184	br->node_idx, br->dead_node);
				2185
				2186	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
				2187
				2188	spin_lock(&dlm->spinlock);
				2189	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2190	if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
				2191	mlog(0, "%s: new_master %u died, changing "
				2192	"to %u\n", dlm->name, dlm->reco.new_master,
				2193	br->node_idx);
				2194	} else {
				2195	mlog(0, "%s: new_master %u NOT DEAD, changing "
				2196	"to %u\n", dlm->name, dlm->reco.new_master,
				2197	br->node_idx);
				2198	/* may not have seen the new master as dead yet */
				2199	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2200	}
				2201	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2202	mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
				2203	"node %u changing it to %u\n", dlm->name,
				2204	dlm->reco.dead_node, br->node_idx, br->dead_node);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2205	}
				2206	dlm->reco.new_master = br->node_idx;
				2207	dlm->reco.dead_node = br->dead_node;
				2208	if (!test_bit(br->dead_node, dlm->recovery_map)) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2209	mlog(0, "recovery master %u sees %u as dead, but this "
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2210	"node has not yet. marking %u as dead\n",
				2211	br->node_idx, br->dead_node, br->dead_node);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame^]	2212	if (!test_bit(br->dead_node, dlm->domain_map) \|\|
				2213	!test_bit(br->dead_node, dlm->live_nodes_map))
				2214	mlog(0, "%u not in domain/live_nodes map "
				2215	"so setting it in reco map manually\n",
				2216	br->dead_node);
				2217	set_bit(br->dead_node, dlm->recovery_map);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2218	__dlm_hb_node_down(dlm, br->dead_node);
				2219	}
				2220	spin_unlock(&dlm->spinlock);
				2221
				2222	dlm_kick_recovery_thread(dlm);
				2223	dlm_put(dlm);
				2224	return 0;
				2225	}
				2226
				2227	static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
				2228	{
				2229	int ret = 0;
				2230	struct dlm_finalize_reco fr;
				2231	struct dlm_node_iter iter;
				2232	int nodenum;
				2233	int status;
				2234
				2235	mlog(0, "finishing recovery for node %s:%u\n",
				2236	dlm->name, dlm->reco.dead_node);
				2237
				2238	spin_lock(&dlm->spinlock);
				2239	dlm_node_iter_init(dlm->domain_map, &iter);
				2240	spin_unlock(&dlm->spinlock);
				2241
				2242	memset(&fr, 0, sizeof(fr));
				2243	fr.node_idx = dlm->node_num;
				2244	fr.dead_node = dlm->reco.dead_node;
				2245
				2246	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
				2247	if (nodenum == dlm->node_num)
				2248	continue;
				2249	ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
				2250	&fr, sizeof(fr), nodenum, &status);
				2251	if (ret >= 0) {
				2252	ret = status;
				2253	if (dlm_is_host_down(ret)) {
				2254	/* this has no effect on this recovery
				2255	* session, so set the status to zero to
				2256	* finish out the last recovery */
				2257	mlog(ML_ERROR, "node %u went down after this "
				2258	"node finished recovery.\n", nodenum);
				2259	ret = 0;
				2260	}
				2261	}
				2262	if (ret < 0) {
				2263	mlog_errno(ret);
				2264	break;
				2265	}
				2266	}
				2267
				2268	return ret;
				2269	}
				2270
				2271	int dlm_finalize_reco_handler(struct o2net_msg msg, u32 len, void data)
				2272	{
				2273	struct dlm_ctxt *dlm = data;
				2274	struct dlm_finalize_reco fr = (struct dlm_finalize_reco )msg->buf;
				2275
				2276	/* ok to return 0, domain has gone away */
				2277	if (!dlm_grab(dlm))
				2278	return 0;
				2279
				2280	mlog(0, "node %u finalizing recovery of node %u\n",
				2281	fr->node_idx, fr->dead_node);
				2282
				2283	spin_lock(&dlm->spinlock);
				2284
				2285	if (dlm->reco.new_master != fr->node_idx) {
				2286	mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
				2287	"%u is supposed to be the new master, dead=%u\n",
				2288	fr->node_idx, dlm->reco.new_master, fr->dead_node);
				2289	BUG();
				2290	}
				2291	if (dlm->reco.dead_node != fr->dead_node) {
				2292	mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
				2293	"node %u, but node %u is supposed to be dead\n",
				2294	fr->node_idx, fr->dead_node, dlm->reco.dead_node);
				2295	BUG();
				2296	}
				2297
				2298	dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
				2299
				2300	spin_unlock(&dlm->spinlock);
				2301
				2302	dlm_reset_recovery(dlm);
				2303
				2304	dlm_kick_recovery_thread(dlm);
				2305	dlm_put(dlm);
				2306	return 0;
				2307	}