Blame - fs/ocfs2/dlm/dlmrecovery.c - kernel/msm-4.9

blob: 39488763728936e706dfb952f07e3f6b5963c0e5 [file] [log] [blame]

/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * dlmrecovery.c
 *
 * recovery stuff
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 */
				26
				27
				28	#include <linux/module.h>
				29	#include <linux/fs.h>
				30	#include <linux/types.h>
				31	#include <linux/slab.h>
				32	#include <linux/highmem.h>
				33	#include <linux/utsname.h>
				34	#include <linux/init.h>
				35	#include <linux/sysctl.h>
				36	#include <linux/random.h>
				37	#include <linux/blkdev.h>
				38	#include <linux/socket.h>
				39	#include <linux/inet.h>
				40	#include <linux/timer.h>
				41	#include <linux/kthread.h>
Adrian Bunk	b4c7f53	2006-01-14 20:55:10 +0100	[diff] [blame]	42	#include <linux/delay.h>
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	43
				44
				45	#include "cluster/heartbeat.h"
				46	#include "cluster/nodemanager.h"
				47	#include "cluster/tcp.h"
				48
				49	#include "dlmapi.h"
				50	#include "dlmcommon.h"
				51	#include "dlmdomain.h"
				52
				53	#define MLOG_MASK_PREFIX (ML_DLM\|ML_DLM_RECOVERY)
				54	#include "cluster/masklog.h"
				55
				56	static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
				57
				58	static int dlm_recovery_thread(void *data);
				59	void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
				60	int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
Kurt Hackel	c03872f	2006-03-06 14:08:49 -0800	[diff] [blame]	61	void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	62	static int dlm_do_recovery(struct dlm_ctxt *dlm);
				63
				64	static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
				65	static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
				66	static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
				67	static int dlm_request_all_locks(struct dlm_ctxt *dlm,
				68	u8 request_from, u8 dead_node);
				69	static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
				70
				71	static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
				72	static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
				73	const char *lockname, int namelen,
				74	int total_locks, u64 cookie,
				75	u8 flags, u8 master);
				76	static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				77	struct dlm_migratable_lockres *mres,
				78	u8 send_to,
				79	struct dlm_lock_resource *res,
				80	int total_locks);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	81	static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
				82	struct dlm_lock_resource *res,
				83	struct dlm_migratable_lockres *mres);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	84	static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
				85	static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
				86	u8 dead_node, u8 send_to);
				87	static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
				88	static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
				89	struct list_head *list, u8 dead_node);
				90	static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
				91	u8 dead_node, u8 new_master);
				92	static void dlm_reco_ast(void *astdata);
				93	static void dlm_reco_bast(void *astdata, int blocked_type);
				94	static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
				95	static void dlm_request_all_locks_worker(struct dlm_work_item *item,
				96	void *data);
				97	static void dlm_mig_lockres_worker(struct dlm_work_item item, void data);
				98
				99	static u64 dlm_get_next_mig_cookie(void);
				100
				101	static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
				102	static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
				103	static u64 dlm_mig_cookie = 1;
				104
				105	static u64 dlm_get_next_mig_cookie(void)
				106	{
				107	u64 c;
				108	spin_lock(&dlm_mig_cookie_lock);
				109	c = dlm_mig_cookie;
				110	if (dlm_mig_cookie == (~0ULL))
				111	dlm_mig_cookie = 1;
				112	else
				113	dlm_mig_cookie++;
				114	spin_unlock(&dlm_mig_cookie_lock);
				115	return c;
				116	}
				117
Kurt Hackel	ab27eb6	2006-04-27 18:03:49 -0700	[diff] [blame]	118	static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
				119	u8 dead_node)
				120	{
				121	assert_spin_locked(&dlm->spinlock);
				122	if (dlm->reco.dead_node != dead_node)
				123	mlog(0, "%s: changing dead_node from %u to %u\n",
				124	dlm->name, dlm->reco.dead_node, dead_node);
				125	dlm->reco.dead_node = dead_node;
				126	}
				127
				128	static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
				129	u8 master)
				130	{
				131	assert_spin_locked(&dlm->spinlock);
				132	mlog(0, "%s: changing new_master from %u to %u\n",
				133	dlm->name, dlm->reco.new_master, master);
				134	dlm->reco.new_master = master;
				135	}
				136
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	137	static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
				138	{
				139	spin_lock(&dlm->spinlock);
				140	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
Kurt Hackel	ab27eb6	2006-04-27 18:03:49 -0700	[diff] [blame]	141	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
				142	dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	143	spin_unlock(&dlm->spinlock);
				144	}
				145
				146	/* Worker function used during recovery. */
				147	void dlm_dispatch_work(void *data)
				148	{
				149	struct dlm_ctxt dlm = (struct dlm_ctxt )data;
				150	LIST_HEAD(tmp_list);
				151	struct list_head iter, iter2;
				152	struct dlm_work_item *item;
				153	dlm_workfunc_t *workfunc;
				154
				155	spin_lock(&dlm->work_lock);
				156	list_splice_init(&dlm->work_list, &tmp_list);
				157	spin_unlock(&dlm->work_lock);
				158
				159	list_for_each_safe(iter, iter2, &tmp_list) {
				160	item = list_entry(iter, struct dlm_work_item, list);
				161	workfunc = item->func;
				162	list_del_init(&item->list);
				163
				164	/* already have ref on dlm to avoid having
				165	* it disappear. just double-check. */
				166	BUG_ON(item->dlm != dlm);
				167
				168	/* this is allowed to sleep and
				169	* call network stuff */
				170	workfunc(item, item->data);
				171
				172	dlm_put(dlm);
				173	kfree(item);
				174	}
				175	}
				176
				177	/*
				178	* RECOVERY THREAD
				179	*/
				180
Kurt Hackel	c03872f	2006-03-06 14:08:49 -0800	[diff] [blame]	181	void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	182	{
				183	/* wake the recovery thread
				184	* this will wake the reco thread in one of three places
				185	* 1) sleeping with no recovery happening
				186	* 2) sleeping with recovery mastered elsewhere
				187	* 3) recovery mastered here, waiting on reco data */
				188
				189	wake_up(&dlm->dlm_reco_thread_wq);
				190	}
				191
				192	/* Launch the recovery thread */
				193	int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
				194	{
				195	mlog(0, "starting dlm recovery thread...\n");
				196
				197	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
				198	"dlm_reco_thread");
				199	if (IS_ERR(dlm->dlm_reco_thread_task)) {
				200	mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
				201	dlm->dlm_reco_thread_task = NULL;
				202	return -EINVAL;
				203	}
				204
				205	return 0;
				206	}
				207
				208	void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
				209	{
				210	if (dlm->dlm_reco_thread_task) {
				211	mlog(0, "waiting for dlm recovery thread to exit\n");
				212	kthread_stop(dlm->dlm_reco_thread_task);
				213	dlm->dlm_reco_thread_task = NULL;
				214	}
				215	}
				216
				217
				218
/*
 * this is lame, but here's how recovery works...
 * 1) all recovery threads cluster wide will work on recovering
 *    ONE node at a time
 * 2) negotiate who will take over all the locks for the dead node.
 *    that's right... ALL the locks.
 * 3) once a new master is chosen, everyone scans all locks
 *    and moves aside those mastered by the dead guy
 * 4) each of these locks should be locked until recovery is done
 * 5) the new master collects up all of the secondary lock queue info
 *    one lock at a time, forcing each node to communicate back
 *    before continuing
 * 6) each secondary lock queue responds with the full known lock info
 * 7) once the new master has run all its locks, it sends an ALLDONE!
 *    message to everyone
 * 8) upon receiving this message, the secondary queue node unlocks
 *    and responds to the ALLDONE
 * 9) once the new master gets responses from everyone, it unlocks
 *    everything and recovery for this dead node is done
 *10) go back to 2) while there are still dead nodes
 *
 */
				241
				242
				243	#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
				244
				245	static int dlm_recovery_thread(void *data)
				246	{
				247	int status;
				248	struct dlm_ctxt *dlm = data;
				249	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
				250
				251	mlog(0, "dlm thread running for %s...\n", dlm->name);
				252
				253	while (!kthread_should_stop()) {
				254	if (dlm_joined(dlm)) {
				255	status = dlm_do_recovery(dlm);
				256	if (status == -EAGAIN) {
				257	/* do not sleep, recheck immediately. */
				258	continue;
				259	}
				260	if (status < 0)
				261	mlog_errno(status);
				262	}
				263
				264	wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
				265	kthread_should_stop(),
				266	timeout);
				267	}
				268
				269	mlog(0, "quitting DLM recovery thread\n");
				270	return 0;
				271	}
				272
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	273	/* returns true when the recovery master has contacted us */
				274	static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
				275	{
				276	int ready;
				277	spin_lock(&dlm->spinlock);
				278	ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
				279	spin_unlock(&dlm->spinlock);
				280	return ready;
				281	}
				282
				283	/* returns true if node is no longer in the domain
				284	* could be dead or just not joined */
				285	int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
				286	{
				287	int dead;
				288	spin_lock(&dlm->spinlock);
Kurt Hackel	aba9aac	2006-04-27 18:00:21 -0700	[diff] [blame]	289	dead = !test_bit(node, dlm->domain_map);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	290	spin_unlock(&dlm->spinlock);
				291	return dead;
				292	}
				293
Kurt Hackel	44465a7	2006-01-18 17:05:38 -0800	[diff] [blame]	294	int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
				295	{
				296	if (timeout) {
				297	mlog(ML_NOTICE, "%s: waiting %dms for notification of "
				298	"death of node %u\n", dlm->name, timeout, node);
				299	wait_event_timeout(dlm->dlm_reco_thread_wq,
				300	dlm_is_node_dead(dlm, node),
				301	msecs_to_jiffies(timeout));
				302	} else {
				303	mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
				304	"of death of node %u\n", dlm->name, node);
				305	wait_event(dlm->dlm_reco_thread_wq,
				306	dlm_is_node_dead(dlm, node));
				307	}
				308	/* for now, return 0 */
				309	return 0;
				310	}
				311
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	312	/* callers of the top-level api calls (dlmlock/dlmunlock) should
				313	* block on the dlm->reco.event when recovery is in progress.
				314	* the dlm recovery thread will set this state when it begins
				315	* recovering a dead node (as the new master or not) and clear
				316	* the state and wake as soon as all affected lock resources have
				317	* been marked with the RECOVERY flag */
				318	static int dlm_in_recovery(struct dlm_ctxt *dlm)
				319	{
				320	int in_recovery;
				321	spin_lock(&dlm->spinlock);
				322	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
				323	spin_unlock(&dlm->spinlock);
				324	return in_recovery;
				325	}
				326
				327
				328	void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
				329	{
				330	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
				331	}
				332
				333	static void dlm_begin_recovery(struct dlm_ctxt *dlm)
				334	{
				335	spin_lock(&dlm->spinlock);
				336	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
				337	dlm->reco.state \|= DLM_RECO_STATE_ACTIVE;
				338	spin_unlock(&dlm->spinlock);
				339	}
				340
				341	static void dlm_end_recovery(struct dlm_ctxt *dlm)
				342	{
				343	spin_lock(&dlm->spinlock);
				344	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
				345	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
				346	spin_unlock(&dlm->spinlock);
				347	wake_up(&dlm->reco.event);
				348	}
				349
				350	static int dlm_do_recovery(struct dlm_ctxt *dlm)
				351	{
				352	int status = 0;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	353	int ret;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	354
				355	spin_lock(&dlm->spinlock);
				356
				357	/* check to see if the new master has died */
				358	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
				359	test_bit(dlm->reco.new_master, dlm->recovery_map)) {
				360	mlog(0, "new master %u died while recovering %u!\n",
				361	dlm->reco.new_master, dlm->reco.dead_node);
				362	/* unset the new_master, leave dead_node */
Kurt Hackel	ab27eb6	2006-04-27 18:03:49 -0700	[diff] [blame]	363	dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	364	}
				365
				366	/* select a target to recover */
				367	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
				368	int bit;
				369
				370	bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
				371	if (bit >= O2NM_MAX_NODES \|\| bit < 0)
Kurt Hackel	ab27eb6	2006-04-27 18:03:49 -0700	[diff] [blame]	372	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	373	else
Kurt Hackel	ab27eb6	2006-04-27 18:03:49 -0700	[diff] [blame]	374	dlm_set_reco_dead_node(dlm, bit);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	375	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
				376	/* BUG? */
				377	mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
				378	dlm->reco.dead_node);
Kurt Hackel	ab27eb6	2006-04-27 18:03:49 -0700	[diff] [blame]	379	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	380	}
				381
				382	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
				383	// mlog(0, "nothing to recover! sleeping now!\n");
				384	spin_unlock(&dlm->spinlock);
				385	/* return to main thread loop and sleep. */
				386	return 0;
				387	}
				388	mlog(0, "recovery thread found node %u in the recovery map!\n",
				389	dlm->reco.dead_node);
				390	spin_unlock(&dlm->spinlock);
				391
				392	/* take write barrier */
				393	/* (stops the list reshuffling thread, proxy ast handling) */
				394	dlm_begin_recovery(dlm);
				395
				396	if (dlm->reco.new_master == dlm->node_num)
				397	goto master_here;
				398
				399	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	400	/* choose a new master, returns 0 if this node
				401	* is the master, -EEXIST if it's another node.
				402	* this does not return until a new master is chosen
				403	* or recovery completes entirely. */
				404	ret = dlm_pick_recovery_master(dlm);
				405	if (!ret) {
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	406	/* already notified everyone. go. */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	407	goto master_here;
				408	}
				409	mlog(0, "another node will master this recovery session.\n");
				410	}
				411	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
				412	dlm->name, dlm->reco.new_master,
				413	dlm->node_num, dlm->reco.dead_node);
				414
				415	/* it is safe to start everything back up here
				416	* because all of the dead node's lock resources
				417	* have been marked as in-recovery */
				418	dlm_end_recovery(dlm);
				419
				420	/* sleep out in main dlm_recovery_thread loop. */
				421	return 0;
				422
				423	master_here:
				424	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
				425	dlm->name, dlm->reco.dead_node, dlm->node_num);
				426
				427	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
				428	if (status < 0) {
				429	mlog(ML_ERROR, "error %d remastering locks for node %u, "
				430	"retrying.\n", status, dlm->reco.dead_node);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	431	/* yield a bit to allow any final network messages
				432	* to get handled on remaining nodes */
				433	msleep(100);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	434	} else {
				435	/* success! see if any other nodes need recovery */
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	436	mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
				437	dlm->name, dlm->reco.dead_node, dlm->node_num);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	438	dlm_reset_recovery(dlm);
				439	}
				440	dlm_end_recovery(dlm);
				441
				442	/* continue and look for another dead node */
				443	return -EAGAIN;
				444	}
				445
				446	static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
				447	{
				448	int status = 0;
				449	struct dlm_reco_node_data *ndata;
				450	struct list_head *iter;
				451	int all_nodes_done;
				452	int destroy = 0;
				453	int pass = 0;
				454
				455	status = dlm_init_recovery_area(dlm, dead_node);
				456	if (status < 0)
				457	goto leave;
				458
				459	/* safe to access the node data list without a lock, since this
				460	* process is the only one to change the list */
				461	list_for_each(iter, &dlm->reco.node_data) {
				462	ndata = list_entry (iter, struct dlm_reco_node_data, list);
				463	BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
				464	ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
				465
				466	mlog(0, "requesting lock info from node %u\n",
				467	ndata->node_num);
				468
				469	if (ndata->node_num == dlm->node_num) {
				470	ndata->state = DLM_RECO_NODE_DATA_DONE;
				471	continue;
				472	}
				473
				474	status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
				475	if (status < 0) {
				476	mlog_errno(status);
				477	if (dlm_is_host_down(status))
				478	ndata->state = DLM_RECO_NODE_DATA_DEAD;
				479	else {
				480	destroy = 1;
				481	goto leave;
				482	}
				483	}
				484
				485	switch (ndata->state) {
				486	case DLM_RECO_NODE_DATA_INIT:
				487	case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				488	case DLM_RECO_NODE_DATA_REQUESTED:
				489	BUG();
				490	break;
				491	case DLM_RECO_NODE_DATA_DEAD:
				492	mlog(0, "node %u died after requesting "
				493	"recovery info for node %u\n",
				494	ndata->node_num, dead_node);
				495	// start all over
				496	destroy = 1;
				497	status = -EAGAIN;
				498	goto leave;
				499	case DLM_RECO_NODE_DATA_REQUESTING:
				500	ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
				501	mlog(0, "now receiving recovery data from "
				502	"node %u for dead node %u\n",
				503	ndata->node_num, dead_node);
				504	break;
				505	case DLM_RECO_NODE_DATA_RECEIVING:
				506	mlog(0, "already receiving recovery data from "
				507	"node %u for dead node %u\n",
				508	ndata->node_num, dead_node);
				509	break;
				510	case DLM_RECO_NODE_DATA_DONE:
				511	mlog(0, "already DONE receiving recovery data "
				512	"from node %u for dead node %u\n",
				513	ndata->node_num, dead_node);
				514	break;
				515	}
				516	}
				517
				518	mlog(0, "done requesting all lock info\n");
				519
				520	/* nodes should be sending reco data now
				521	* just need to wait */
				522
				523	while (1) {
				524	/* check all the nodes now to see if we are
				525	* done, or if anyone died */
				526	all_nodes_done = 1;
				527	spin_lock(&dlm_reco_state_lock);
				528	list_for_each(iter, &dlm->reco.node_data) {
				529	ndata = list_entry (iter, struct dlm_reco_node_data, list);
				530
				531	mlog(0, "checking recovery state of node %u\n",
				532	ndata->node_num);
				533	switch (ndata->state) {
				534	case DLM_RECO_NODE_DATA_INIT:
				535	case DLM_RECO_NODE_DATA_REQUESTING:
				536	mlog(ML_ERROR, "bad ndata state for "
				537	"node %u: state=%d\n",
				538	ndata->node_num, ndata->state);
				539	BUG();
				540	break;
				541	case DLM_RECO_NODE_DATA_DEAD:
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	542	mlog(ML_NOTICE, "node %u died after "
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	543	"requesting recovery info for "
				544	"node %u\n", ndata->node_num,
				545	dead_node);
				546	spin_unlock(&dlm_reco_state_lock);
				547	// start all over
				548	destroy = 1;
				549	status = -EAGAIN;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	550	/* instead of spinning like crazy here,
				551	* wait for the domain map to catch up
				552	* with the network state. otherwise this
				553	* can be hit hundreds of times before
				554	* the node is really seen as dead. */
				555	wait_event_timeout(dlm->dlm_reco_thread_wq,
				556	dlm_is_node_dead(dlm,
				557	ndata->node_num),
				558	msecs_to_jiffies(1000));
				559	mlog(0, "waited 1 sec for %u, "
				560	"dead? %s\n", ndata->node_num,
				561	dlm_is_node_dead(dlm, ndata->node_num) ?
				562	"yes" : "no");
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	563	goto leave;
				564	case DLM_RECO_NODE_DATA_RECEIVING:
				565	case DLM_RECO_NODE_DATA_REQUESTED:
				566	all_nodes_done = 0;
				567	break;
				568	case DLM_RECO_NODE_DATA_DONE:
				569	break;
				570	case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				571	break;
				572	}
				573	}
				574	spin_unlock(&dlm_reco_state_lock);
				575
				576	mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
				577	all_nodes_done?"yes":"no");
				578	if (all_nodes_done) {
				579	int ret;
				580
				581	/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
				582	* just send a finalize message to everyone and
				583	* clean up */
				584	mlog(0, "all nodes are done! send finalize\n");
				585	ret = dlm_send_finalize_reco_message(dlm);
				586	if (ret < 0)
				587	mlog_errno(ret);
				588
				589	spin_lock(&dlm->spinlock);
				590	dlm_finish_local_lockres_recovery(dlm, dead_node,
				591	dlm->node_num);
				592	spin_unlock(&dlm->spinlock);
				593	mlog(0, "should be done with recovery!\n");
				594
				595	mlog(0, "finishing recovery of %s at %lu, "
				596	"dead=%u, this=%u, new=%u\n", dlm->name,
				597	jiffies, dlm->reco.dead_node,
				598	dlm->node_num, dlm->reco.new_master);
				599	destroy = 1;
				600	status = ret;
				601	/* rescan everything marked dirty along the way */
				602	dlm_kick_thread(dlm, NULL);
				603	break;
				604	}
				605	/* wait to be signalled, with periodic timeout
				606	* to check for node death */
				607	wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
				608	kthread_should_stop(),
				609	msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
				610
				611	}
				612
				613	leave:
				614	if (destroy)
				615	dlm_destroy_recovery_area(dlm, dead_node);
				616
				617	mlog_exit(status);
				618	return status;
				619	}
				620
				621	static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
				622	{
				623	int num=0;
				624	struct dlm_reco_node_data *ndata;
				625
				626	spin_lock(&dlm->spinlock);
				627	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
				628	/* nodes can only be removed (by dying) after dropping
				629	* this lock, and death will be trapped later, so this should do */
				630	spin_unlock(&dlm->spinlock);
				631
				632	while (1) {
				633	num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
				634	if (num >= O2NM_MAX_NODES) {
				635	break;
				636	}
				637	BUG_ON(num == dead_node);
				638
				639	ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
				640	if (!ndata) {
				641	dlm_destroy_recovery_area(dlm, dead_node);
				642	return -ENOMEM;
				643	}
				644	ndata->node_num = num;
				645	ndata->state = DLM_RECO_NODE_DATA_INIT;
				646	spin_lock(&dlm_reco_state_lock);
				647	list_add_tail(&ndata->list, &dlm->reco.node_data);
				648	spin_unlock(&dlm_reco_state_lock);
				649	num++;
				650	}
				651
				652	return 0;
				653	}
				654
				655	static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
				656	{
				657	struct list_head iter, iter2;
				658	struct dlm_reco_node_data *ndata;
				659	LIST_HEAD(tmplist);
				660
				661	spin_lock(&dlm_reco_state_lock);
				662	list_splice_init(&dlm->reco.node_data, &tmplist);
				663	spin_unlock(&dlm_reco_state_lock);
				664
				665	list_for_each_safe(iter, iter2, &tmplist) {
				666	ndata = list_entry (iter, struct dlm_reco_node_data, list);
				667	list_del_init(&ndata->list);
				668	kfree(ndata);
				669	}
				670	}
				671
				672	static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
				673	u8 dead_node)
				674	{
				675	struct dlm_lock_request lr;
				676	enum dlm_status ret;
				677
				678	mlog(0, "\n");
				679
				680
				681	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
				682	"to %u\n", dead_node, request_from);
				683
				684	memset(&lr, 0, sizeof(lr));
				685	lr.node_idx = dlm->node_num;
				686	lr.dead_node = dead_node;
				687
				688	// send message
				689	ret = DLM_NOLOCKMGR;
				690	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
				691	&lr, sizeof(lr), request_from, NULL);
				692
				693	/* negative status is handled by caller */
				694	if (ret < 0)
				695	mlog_errno(ret);
				696
				697	// return from here, then
				698	// sleep until all received or error
				699	return ret;
				700
				701	}
				702
				703	int dlm_request_all_locks_handler(struct o2net_msg msg, u32 len, void data)
				704	{
				705	struct dlm_ctxt *dlm = data;
				706	struct dlm_lock_request lr = (struct dlm_lock_request )msg->buf;
				707	char *buf = NULL;
				708	struct dlm_work_item *item = NULL;
				709
				710	if (!dlm_grab(dlm))
				711	return -EINVAL;
				712
Kurt Hackel	c3187ce	2006-04-27 18:05:41 -0700	[diff] [blame^]	713	if (lr->dead_node != dlm->reco.dead_node) {
				714	mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
				715	"dead_node is %u\n", dlm->name, lr->node_idx,
				716	lr->dead_node, dlm->reco.dead_node);
				717	/* this is a hack */
				718	dlm_put(dlm);
				719	return -ENOMEM;
				720	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	721	BUG_ON(lr->dead_node != dlm->reco.dead_node);
				722
				723	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
				724	if (!item) {
				725	dlm_put(dlm);
				726	return -ENOMEM;
				727	}
				728
				729	/* this will get freed by dlm_request_all_locks_worker */
				730	buf = (char *) __get_free_page(GFP_KERNEL);
				731	if (!buf) {
				732	kfree(item);
				733	dlm_put(dlm);
				734	return -ENOMEM;
				735	}
				736
				737	/* queue up work for dlm_request_all_locks_worker */
				738	dlm_grab(dlm); /* get an extra ref for the work item */
				739	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
				740	item->u.ral.reco_master = lr->node_idx;
				741	item->u.ral.dead_node = lr->dead_node;
				742	spin_lock(&dlm->work_lock);
				743	list_add_tail(&item->list, &dlm->work_list);
				744	spin_unlock(&dlm->work_lock);
				745	schedule_work(&dlm->dispatched_work);
				746
				747	dlm_put(dlm);
				748	return 0;
				749	}
				750
				751	static void dlm_request_all_locks_worker(struct dlm_work_item item, void data)
				752	{
				753	struct dlm_migratable_lockres *mres;
				754	struct dlm_lock_resource *res;
				755	struct dlm_ctxt *dlm;
				756	LIST_HEAD(resources);
				757	struct list_head *iter;
				758	int ret;
				759	u8 dead_node, reco_master;
				760
				761	dlm = item->dlm;
				762	dead_node = item->u.ral.dead_node;
				763	reco_master = item->u.ral.reco_master;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	764	mres = (struct dlm_migratable_lockres *)data;
				765
				766	if (dead_node != dlm->reco.dead_node \|\|
				767	reco_master != dlm->reco.new_master) {
				768	/* show extra debug info if the recovery state is messed */
				769	mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
				770	"request(dead=%u, master=%u)\n",
				771	dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
				772	dead_node, reco_master);
				773	mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
Kurt Hackel	2900485	2006-03-02 16:43:36 -0800	[diff] [blame]	774	"entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	775	dlm->name, mres->lockname_len, mres->lockname, mres->master,
				776	mres->num_locks, mres->total_locks, mres->flags,
Kurt Hackel	2900485	2006-03-02 16:43:36 -0800	[diff] [blame]	777	dlm_get_lock_cookie_node(mres->ml[0].cookie),
				778	dlm_get_lock_cookie_seq(mres->ml[0].cookie),
				779	mres->ml[0].list, mres->ml[0].flags,
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	780	mres->ml[0].type, mres->ml[0].convert_type,
				781	mres->ml[0].highest_blocked, mres->ml[0].node);
				782	BUG();
				783	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	784	BUG_ON(dead_node != dlm->reco.dead_node);
				785	BUG_ON(reco_master != dlm->reco.new_master);
				786
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	787	/* lock resources should have already been moved to the
				788	* dlm->reco.resources list. now move items from that list
				789	* to a temp list if the dead owner matches. note that the
				790	* whole cluster recovers only one node at a time, so we
				791	* can safely move UNKNOWN lock resources for each recovery
				792	* session. */
				793	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
				794
				795	/* now we can begin blasting lockreses without the dlm lock */
				796	list_for_each(iter, &resources) {
				797	res = list_entry (iter, struct dlm_lock_resource, recovering);
				798	ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
				799	DLM_MRES_RECOVERY);
				800	if (ret < 0)
				801	mlog_errno(ret);
				802	}
				803
				804	/* move the resources back to the list */
				805	spin_lock(&dlm->spinlock);
				806	list_splice_init(&resources, &dlm->reco.resources);
				807	spin_unlock(&dlm->spinlock);
				808
				809	ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
				810	if (ret < 0)
				811	mlog_errno(ret);
				812
				813	free_page((unsigned long)data);
				814	}
				815
				816
				817	static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
				818	{
				819	int ret, tmpret;
				820	struct dlm_reco_data_done done_msg;
				821
				822	memset(&done_msg, 0, sizeof(done_msg));
				823	done_msg.node_idx = dlm->node_num;
				824	done_msg.dead_node = dead_node;
				825	mlog(0, "sending DATA DONE message to %u, "
				826	"my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
				827	done_msg.dead_node);
				828
				829	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
				830	sizeof(done_msg), send_to, &tmpret);
				831	/* negative status is ignored by the caller */
				832	if (ret >= 0)
				833	ret = tmpret;
				834	return ret;
				835	}
				836
				837
				838	int dlm_reco_data_done_handler(struct o2net_msg msg, u32 len, void data)
				839	{
				840	struct dlm_ctxt *dlm = data;
				841	struct dlm_reco_data_done done = (struct dlm_reco_data_done )msg->buf;
				842	struct list_head *iter;
				843	struct dlm_reco_node_data *ndata = NULL;
				844	int ret = -EINVAL;
				845
				846	if (!dlm_grab(dlm))
				847	return -EINVAL;
				848
				849	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
				850	"node_idx=%u, this node=%u\n", done->dead_node,
				851	dlm->reco.dead_node, done->node_idx, dlm->node_num);
				852	BUG_ON(done->dead_node != dlm->reco.dead_node);
				853
				854	spin_lock(&dlm_reco_state_lock);
				855	list_for_each(iter, &dlm->reco.node_data) {
				856	ndata = list_entry (iter, struct dlm_reco_node_data, list);
				857	if (ndata->node_num != done->node_idx)
				858	continue;
				859
				860	switch (ndata->state) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	861	/* should have moved beyond INIT but not to FINALIZE yet */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	862	case DLM_RECO_NODE_DATA_INIT:
				863	case DLM_RECO_NODE_DATA_DEAD:
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	864	case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				865	mlog(ML_ERROR, "bad ndata state for node %u:"
				866	" state=%d\n", ndata->node_num,
				867	ndata->state);
				868	BUG();
				869	break;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	870	/* these states are possible at this point, anywhere along
				871	* the line of recovery */
				872	case DLM_RECO_NODE_DATA_DONE:
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	873	case DLM_RECO_NODE_DATA_RECEIVING:
				874	case DLM_RECO_NODE_DATA_REQUESTED:
				875	case DLM_RECO_NODE_DATA_REQUESTING:
				876	mlog(0, "node %u is DONE sending "
				877	"recovery data!\n",
				878	ndata->node_num);
				879
				880	ndata->state = DLM_RECO_NODE_DATA_DONE;
				881	ret = 0;
				882	break;
				883	}
				884	}
				885	spin_unlock(&dlm_reco_state_lock);
				886
				887	/* wake the recovery thread, some node is done */
				888	if (!ret)
				889	dlm_kick_recovery_thread(dlm);
				890
				891	if (ret < 0)
				892	mlog(ML_ERROR, "failed to find recovery node data for node "
				893	"%u\n", done->node_idx);
				894	dlm_put(dlm);
				895
				896	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
				897	return ret;
				898	}
				899
				900	static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
				901	struct list_head *list,
				902	u8 dead_node)
				903	{
				904	struct dlm_lock_resource *res;
				905	struct list_head iter, iter2;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	906	struct dlm_lock *lock;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	907
				908	spin_lock(&dlm->spinlock);
				909	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
				910	res = list_entry (iter, struct dlm_lock_resource, recovering);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	911	/* always prune any $RECOVERY entries for dead nodes,
				912	* otherwise hangs can occur during later recovery */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	913	if (dlm_is_recovery_lock(res->lockname.name,
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	914	res->lockname.len)) {
				915	spin_lock(&res->spinlock);
				916	list_for_each_entry(lock, &res->granted, list) {
				917	if (lock->ml.node == dead_node) {
				918	mlog(0, "AHA! there was "
				919	"a $RECOVERY lock for dead "
				920	"node %u (%s)!\n",
				921	dead_node, dlm->name);
				922	list_del_init(&lock->list);
				923	dlm_lock_put(lock);
				924	break;
				925	}
				926	}
				927	spin_unlock(&res->spinlock);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	928	continue;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	929	}
				930
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	931	if (res->owner == dead_node) {
				932	mlog(0, "found lockres owned by dead node while "
				933	"doing recovery for node %u. sending it.\n",
				934	dead_node);
Akinobu Mita	f116629	2006-06-26 00:24:46 -0700	[diff] [blame]	935	list_move_tail(&res->recovering, list);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	936	} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
				937	mlog(0, "found UNKNOWN owner while doing recovery "
				938	"for node %u. sending it.\n", dead_node);
Akinobu Mita	f116629	2006-06-26 00:24:46 -0700	[diff] [blame]	939	list_move_tail(&res->recovering, list);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	940	}
				941	}
				942	spin_unlock(&dlm->spinlock);
				943	}
				944
				945	static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
				946	{
				947	int total_locks = 0;
				948	struct list_head iter, queue = &res->granted;
				949	int i;
				950
				951	for (i=0; i<3; i++) {
				952	list_for_each(iter, queue)
				953	total_locks++;
				954	queue++;
				955	}
				956	return total_locks;
				957	}
				958
				959
				960	static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				961	struct dlm_migratable_lockres *mres,
				962	u8 send_to,
				963	struct dlm_lock_resource *res,
				964	int total_locks)
				965	{
				966	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
				967	int mres_total_locks = be32_to_cpu(mres->total_locks);
				968	int sz, ret = 0, status = 0;
				969	u8 orig_flags = mres->flags,
				970	orig_master = mres->master;
				971
				972	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
				973	if (!mres->num_locks)
				974	return 0;
				975
				976	sz = sizeof(struct dlm_migratable_lockres) +
				977	(mres->num_locks * sizeof(struct dlm_migratable_lock));
				978
				979	/* add an all-done flag if we reached the last lock */
				980	orig_flags = mres->flags;
				981	BUG_ON(total_locks > mres_total_locks);
				982	if (total_locks == mres_total_locks)
				983	mres->flags \|= DLM_MRES_ALL_DONE;
				984
				985	/* send it */
				986	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
				987	sz, send_to, &status);
				988	if (ret < 0) {
				989	/* XXX: negative status is not handled.
				990	* this will end up killing this node. */
				991	mlog_errno(ret);
				992	} else {
				993	/* might get an -ENOMEM back here */
				994	ret = status;
				995	if (ret < 0) {
				996	mlog_errno(ret);
				997
				998	if (ret == -EFAULT) {
				999	mlog(ML_ERROR, "node %u told me to kill "
				1000	"myself!\n", send_to);
				1001	BUG();
				1002	}
				1003	}
				1004	}
				1005
				1006	/* zero and reinit the message buffer */
				1007	dlm_init_migratable_lockres(mres, res->lockname.name,
				1008	res->lockname.len, mres_total_locks,
				1009	mig_cookie, orig_flags, orig_master);
				1010	return ret;
				1011	}
				1012
				1013	static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
				1014	const char *lockname, int namelen,
				1015	int total_locks, u64 cookie,
				1016	u8 flags, u8 master)
				1017	{
				1018	/* mres here is one full page */
				1019	memset(mres, 0, PAGE_SIZE);
				1020	mres->lockname_len = namelen;
				1021	memcpy(mres->lockname, lockname, namelen);
				1022	mres->num_locks = 0;
				1023	mres->total_locks = cpu_to_be32(total_locks);
				1024	mres->mig_cookie = cpu_to_be64(cookie);
				1025	mres->flags = flags;
				1026	mres->master = master;
				1027	}
				1028
				1029
				1030	/* returns 1 if this lock fills the network structure,
				1031	* 0 otherwise */
				1032	static int dlm_add_lock_to_array(struct dlm_lock *lock,
				1033	struct dlm_migratable_lockres *mres, int queue)
				1034	{
				1035	struct dlm_migratable_lock *ml;
				1036	int lock_num = mres->num_locks;
				1037
				1038	ml = &(mres->ml[lock_num]);
				1039	ml->cookie = lock->ml.cookie;
				1040	ml->type = lock->ml.type;
				1041	ml->convert_type = lock->ml.convert_type;
				1042	ml->highest_blocked = lock->ml.highest_blocked;
				1043	ml->list = queue;
				1044	if (lock->lksb) {
				1045	ml->flags = lock->lksb->flags;
				1046	/* send our current lvb */
				1047	if (ml->type == LKM_EXMODE \|\|
				1048	ml->type == LKM_PRMODE) {
				1049	/* if it is already set, this had better be a PR
				1050	* and it has to match */
Kurt Hackel	8bc674c	2006-04-27 18:02:10 -0700	[diff] [blame]	1051	if (!dlm_lvb_is_empty(mres->lvb) &&
				1052	(ml->type == LKM_EXMODE \|\|
				1053	memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1054	mlog(ML_ERROR, "mismatched lvbs!\n");
				1055	__dlm_print_one_lock_resource(lock->lockres);
				1056	BUG();
				1057	}
				1058	memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
				1059	}
				1060	}
				1061	ml->node = lock->ml.node;
				1062	mres->num_locks++;
				1063	/* we reached the max, send this network message */
				1064	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
				1065	return 1;
				1066	return 0;
				1067	}
				1068
				1069
				1070	int dlm_send_one_lockres(struct dlm_ctxt dlm, struct dlm_lock_resource res,
				1071	struct dlm_migratable_lockres *mres,
				1072	u8 send_to, u8 flags)
				1073	{
				1074	struct list_head queue, iter;
				1075	int total_locks, i;
				1076	u64 mig_cookie = 0;
				1077	struct dlm_lock *lock;
				1078	int ret = 0;
				1079
				1080	BUG_ON(!(flags & (DLM_MRES_RECOVERY\|DLM_MRES_MIGRATION)));
				1081
				1082	mlog(0, "sending to %u\n", send_to);
				1083
				1084	total_locks = dlm_num_locks_in_lockres(res);
				1085	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
				1086	/* rare, but possible */
				1087	mlog(0, "argh. lockres has %d locks. this will "
				1088	"require more than one network packet to "
				1089	"migrate\n", total_locks);
				1090	mig_cookie = dlm_get_next_mig_cookie();
				1091	}
				1092
				1093	dlm_init_migratable_lockres(mres, res->lockname.name,
				1094	res->lockname.len, total_locks,
				1095	mig_cookie, flags, res->owner);
				1096
				1097	total_locks = 0;
				1098	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
				1099	queue = dlm_list_idx_to_ptr(res, i);
				1100	list_for_each(iter, queue) {
				1101	lock = list_entry (iter, struct dlm_lock, list);
				1102
				1103	/* add another lock. */
				1104	total_locks++;
				1105	if (!dlm_add_lock_to_array(lock, mres, i))
				1106	continue;
				1107
				1108	/* this filled the lock message,
				1109	* we must send it immediately. */
				1110	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
				1111	res, total_locks);
				1112	if (ret < 0) {
				1113	// TODO
				1114	mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
				1115	"returned %d, TODO\n", ret);
				1116	BUG();
				1117	}
				1118	}
				1119	}
				1120	/* flush any remaining locks */
				1121	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
				1122	if (ret < 0) {
				1123	// TODO
				1124	mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
				1125	"TODO\n", ret);
				1126	BUG();
				1127	}
				1128	return ret;
				1129	}
				1130
				1131
				1132
				1133	/*
				1134	* this message will contain no more than one page worth of
				1135	* recovery data, and it will work on only one lockres.
				1136	* there may be many locks in this page, and we may need to wait
				1137	* for additional packets to complete all the locks (rare, but
				1138	* possible).
				1139	*/
				1140	/*
				1141	* NOTE: the allocation error cases here are scary
				1142	* we really cannot afford to fail an alloc in recovery
				1143	* do we spin? returning an error only delays the problem really
				1144	*/
				1145
				1146	int dlm_mig_lockres_handler(struct o2net_msg msg, u32 len, void data)
				1147	{
				1148	struct dlm_ctxt *dlm = data;
				1149	struct dlm_migratable_lockres *mres =
				1150	(struct dlm_migratable_lockres *)msg->buf;
				1151	int ret = 0;
				1152	u8 real_master;
				1153	char *buf = NULL;
				1154	struct dlm_work_item *item = NULL;
				1155	struct dlm_lock_resource *res = NULL;
				1156
				1157	if (!dlm_grab(dlm))
				1158	return -EINVAL;
				1159
				1160	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY\|DLM_MRES_MIGRATION)));
				1161
				1162	real_master = mres->master;
				1163	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
				1164	/* cannot migrate a lockres with no master */
				1165	BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
				1166	}
				1167
				1168	mlog(0, "%s message received from node %u\n",
				1169	(mres->flags & DLM_MRES_RECOVERY) ?
				1170	"recovery" : "migration", mres->master);
				1171	if (mres->flags & DLM_MRES_ALL_DONE)
				1172	mlog(0, "all done flag. all lockres data received!\n");
				1173
				1174	ret = -ENOMEM;
				1175	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
				1176	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
				1177	if (!buf \|\| !item)
				1178	goto leave;
				1179
				1180	/* lookup the lock to see if we have a secondary queue for this
				1181	* already... just add the locks in and this will have its owner
				1182	* and RECOVERY flag changed when it completes. */
				1183	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
				1184	if (res) {
				1185	/* this will get a ref on res */
				1186	/* mark it as recovering/migrating and hash it */
				1187	spin_lock(&res->spinlock);
				1188	if (mres->flags & DLM_MRES_RECOVERY) {
				1189	res->state \|= DLM_LOCK_RES_RECOVERING;
				1190	} else {
				1191	if (res->state & DLM_LOCK_RES_MIGRATING) {
				1192	/* this is at least the second
				1193	* lockres message */
				1194	mlog(0, "lock %.*s is already migrating\n",
				1195	mres->lockname_len,
				1196	mres->lockname);
				1197	} else if (res->state & DLM_LOCK_RES_RECOVERING) {
				1198	/* caller should BUG */
				1199	mlog(ML_ERROR, "node is attempting to migrate "
				1200	"lock %.*s, but marked as recovering!\n",
				1201	mres->lockname_len, mres->lockname);
				1202	ret = -EFAULT;
				1203	spin_unlock(&res->spinlock);
				1204	goto leave;
				1205	}
				1206	res->state \|= DLM_LOCK_RES_MIGRATING;
				1207	}
				1208	spin_unlock(&res->spinlock);
				1209	} else {
				1210	/* need to allocate, just like if it was
				1211	* mastered here normally */
				1212	res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
				1213	if (!res)
				1214	goto leave;
				1215
				1216	/* to match the ref that we would have gotten if
				1217	* dlm_lookup_lockres had succeeded */
				1218	dlm_lockres_get(res);
				1219
				1220	/* mark it as recovering/migrating and hash it */
				1221	if (mres->flags & DLM_MRES_RECOVERY)
				1222	res->state \|= DLM_LOCK_RES_RECOVERING;
				1223	else
				1224	res->state \|= DLM_LOCK_RES_MIGRATING;
				1225
				1226	spin_lock(&dlm->spinlock);
				1227	__dlm_insert_lockres(dlm, res);
				1228	spin_unlock(&dlm->spinlock);
				1229
				1230	/* now that the new lockres is inserted,
				1231	* make it usable by other processes */
				1232	spin_lock(&res->spinlock);
				1233	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
				1234	spin_unlock(&res->spinlock);
				1235
				1236	/* add an extra ref for just-allocated lockres
				1237	* otherwise the lockres will be purged immediately */
				1238	dlm_lockres_get(res);
				1239
				1240	}
				1241
				1242	/* at this point we have allocated everything we need,
				1243	* and we have a hashed lockres with an extra ref and
				1244	* the proper res->state flags. */
				1245	ret = 0;
				1246	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
				1247	/* migration cannot have an unknown master */
				1248	BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
				1249	mlog(0, "recovery has passed me a lockres with an "
				1250	"unknown owner.. will need to requery: "
				1251	"%.*s\n", mres->lockname_len, mres->lockname);
				1252	} else {
				1253	spin_lock(&res->spinlock);
				1254	dlm_change_lockres_owner(dlm, res, dlm->node_num);
				1255	spin_unlock(&res->spinlock);
				1256	}
				1257
				1258	/* queue up work for dlm_mig_lockres_worker */
				1259	dlm_grab(dlm); /* get an extra ref for the work item */
				1260	memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */
				1261	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
				1262	item->u.ml.lockres = res; /* already have a ref */
				1263	item->u.ml.real_master = real_master;
				1264	spin_lock(&dlm->work_lock);
				1265	list_add_tail(&item->list, &dlm->work_list);
				1266	spin_unlock(&dlm->work_lock);
				1267	schedule_work(&dlm->dispatched_work);
				1268
				1269	leave:
				1270	dlm_put(dlm);
				1271	if (ret < 0) {
				1272	if (buf)
				1273	kfree(buf);
				1274	if (item)
				1275	kfree(item);
				1276	}
				1277
				1278	mlog_exit(ret);
				1279	return ret;
				1280	}
				1281
				1282
				1283	static void dlm_mig_lockres_worker(struct dlm_work_item item, void data)
				1284	{
				1285	struct dlm_ctxt *dlm = data;
				1286	struct dlm_migratable_lockres *mres;
				1287	int ret = 0;
				1288	struct dlm_lock_resource *res;
				1289	u8 real_master;
				1290
				1291	dlm = item->dlm;
				1292	mres = (struct dlm_migratable_lockres *)data;
				1293
				1294	res = item->u.ml.lockres;
				1295	real_master = item->u.ml.real_master;
				1296
				1297	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
				1298	/* this case is super-rare. only occurs if
				1299	* node death happens during migration. */
				1300	again:
				1301	ret = dlm_lockres_master_requery(dlm, res, &real_master);
				1302	if (ret < 0) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	1303	mlog(0, "dlm_lockres_master_requery ret=%d\n",
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1304	ret);
				1305	goto again;
				1306	}
				1307	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
				1308	mlog(0, "lockres %.*s not claimed. "
				1309	"this node will take it.\n",
				1310	res->lockname.len, res->lockname.name);
				1311	} else {
				1312	mlog(0, "master needs to respond to sender "
				1313	"that node %u still owns %.*s\n",
				1314	real_master, res->lockname.len,
				1315	res->lockname.name);
				1316	/* cannot touch this lockres */
				1317	goto leave;
				1318	}
				1319	}
				1320
				1321	ret = dlm_process_recovery_data(dlm, res, mres);
				1322	if (ret < 0)
				1323	mlog(0, "dlm_process_recovery_data returned %d\n", ret);
				1324	else
				1325	mlog(0, "dlm_process_recovery_data succeeded\n");
				1326
				1327	if ((mres->flags & (DLM_MRES_MIGRATION\|DLM_MRES_ALL_DONE)) ==
				1328	(DLM_MRES_MIGRATION\|DLM_MRES_ALL_DONE)) {
				1329	ret = dlm_finish_migration(dlm, res, mres->master);
				1330	if (ret < 0)
				1331	mlog_errno(ret);
				1332	}
				1333
				1334	leave:
				1335	kfree(data);
				1336	mlog_exit(ret);
				1337	}
				1338
				1339
				1340
Kurt Hackel	c03872f	2006-03-06 14:08:49 -0800	[diff] [blame]	1341	int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
				1342	struct dlm_lock_resource res, u8 real_master)
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1343	{
				1344	struct dlm_node_iter iter;
				1345	int nodenum;
				1346	int ret = 0;
				1347
				1348	*real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
				1349
				1350	/* we only reach here if one of the two nodes in a
				1351	* migration died while the migration was in progress.
				1352	* at this point we need to requery the master. we
				1353	* know that the new_master got as far as creating
				1354	* an mle on at least one node, but we do not know
				1355	* if any nodes had actually cleared the mle and set
				1356	* the master to the new_master. the old master
				1357	* is supposed to set the owner to UNKNOWN in the
				1358	* event of a new_master death, so the only possible
				1359	* responses that we can get from nodes here are
				1360	* that the master is new_master, or that the master
				1361	* is UNKNOWN.
				1362	* if all nodes come back with UNKNOWN then we know
				1363	* the lock needs remastering here.
				1364	* if any node comes back with a valid master, check
				1365	* to see if that master is the one that we are
				1366	* recovering. if so, then the new_master died and
				1367	* we need to remaster this lock. if not, then the
				1368	* new_master survived and that node will respond to
				1369	* other nodes about the owner.
				1370	* if there is an owner, this node needs to dump this
				1371	* lockres and alert the sender that this lockres
				1372	* was rejected. */
				1373	spin_lock(&dlm->spinlock);
				1374	dlm_node_iter_init(dlm->domain_map, &iter);
				1375	spin_unlock(&dlm->spinlock);
				1376
				1377	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
				1378	/* do not send to self */
				1379	if (nodenum == dlm->node_num)
				1380	continue;
				1381	ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
				1382	if (ret < 0) {
				1383	mlog_errno(ret);
Kurt Hackel	c03872f	2006-03-06 14:08:49 -0800	[diff] [blame]	1384	if (!dlm_is_host_down(ret))
				1385	BUG();
				1386	/* host is down, so answer for that node would be
				1387	* DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1388	}
				1389	if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
				1390	mlog(0, "lock master is %u\n", *real_master);
				1391	break;
				1392	}
				1393	}
				1394	return ret;
				1395	}
				1396
				1397
Kurt Hackel	c03872f	2006-03-06 14:08:49 -0800	[diff] [blame]	1398	int dlm_do_master_requery(struct dlm_ctxt dlm, struct dlm_lock_resource res,
				1399	u8 nodenum, u8 *real_master)
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1400	{
				1401	int ret = -EINVAL;
				1402	struct dlm_master_requery req;
				1403	int status = DLM_LOCK_RES_OWNER_UNKNOWN;
				1404
				1405	memset(&req, 0, sizeof(req));
				1406	req.node_idx = dlm->node_num;
				1407	req.namelen = res->lockname.len;
				1408	memcpy(req.name, res->lockname.name, res->lockname.len);
				1409
				1410	ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
				1411	&req, sizeof(req), nodenum, &status);
				1412	/* XXX: negative status not handled properly here. */
				1413	if (ret < 0)
				1414	mlog_errno(ret);
				1415	else {
				1416	BUG_ON(status < 0);
				1417	BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
				1418	*real_master = (u8) (status & 0xff);
				1419	mlog(0, "node %u responded to master requery with %u\n",
				1420	nodenum, *real_master);
				1421	ret = 0;
				1422	}
				1423	return ret;
				1424	}
				1425
				1426
				1427	/* this function cannot error, so unless the sending
				1428	* or receiving of the message failed, the owner can
				1429	* be trusted */
				1430	int dlm_master_requery_handler(struct o2net_msg msg, u32 len, void data)
				1431	{
				1432	struct dlm_ctxt *dlm = data;
				1433	struct dlm_master_requery req = (struct dlm_master_requery )msg->buf;
				1434	struct dlm_lock_resource *res = NULL;
Mark Fasheh	a3d3329	2006-03-09 17:55:56 -0800	[diff] [blame]	1435	unsigned int hash;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1436	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
				1437	u32 flags = DLM_ASSERT_MASTER_REQUERY;
				1438
				1439	if (!dlm_grab(dlm)) {
				1440	/* since the domain has gone away on this
				1441	* node, the proper response is UNKNOWN */
				1442	return master;
				1443	}
				1444
Mark Fasheh	a3d3329	2006-03-09 17:55:56 -0800	[diff] [blame]	1445	hash = dlm_lockid_hash(req->name, req->namelen);
				1446
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1447	spin_lock(&dlm->spinlock);
Mark Fasheh	a3d3329	2006-03-09 17:55:56 -0800	[diff] [blame]	1448	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1449	if (res) {
				1450	spin_lock(&res->spinlock);
				1451	master = res->owner;
				1452	if (master == dlm->node_num) {
				1453	int ret = dlm_dispatch_assert_master(dlm, res,
				1454	0, 0, flags);
				1455	if (ret < 0) {
				1456	mlog_errno(-ENOMEM);
				1457	/* retry!? */
				1458	BUG();
				1459	}
				1460	}
				1461	spin_unlock(&res->spinlock);
				1462	}
				1463	spin_unlock(&dlm->spinlock);
				1464
				1465	dlm_put(dlm);
				1466	return master;
				1467	}
				1468
				1469	static inline struct list_head *
				1470	dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
				1471	{
				1472	struct list_head *ret;
				1473	BUG_ON(list_num < 0);
				1474	BUG_ON(list_num > 2);
				1475	ret = &(res->granted);
				1476	ret += list_num;
				1477	return ret;
				1478	}
				1479	/* TODO: do ast flush business
				1480	* TODO: do MIGRATING and RECOVERING spinning
				1481	*/
				1482
				1483	/*
				1484	* NOTE about in-flight requests during migration:
				1485	*
				1486	* Before attempting the migrate, the master has marked the lockres as
				1487	* MIGRATING and then flushed all of its pending ASTS. So any in-flight
				1488	* requests either got queued before the MIGRATING flag got set, in which
				1489	* case the lock data will reflect the change and a return message is on
				1490	* the way, or the request failed to get in before MIGRATING got set. In
				1491	* this case, the caller will be told to spin and wait for the MIGRATING
				1492	* flag to be dropped, then recheck the master.
				1493	* This holds true for the convert, cancel and unlock cases, and since lvb
				1494	* updates are tied to these same messages, it applies to lvb updates as
				1495	* well. For the lock case, there is no way a lock can be on the master
				1496	* queue and not be on the secondary queue since the lock is always added
				1497	* locally first. This means that the new target node will never be sent
				1498	* a lock that he doesn't already have on the list.
				1499	* In total, this means that the local lock is correct and should not be
				1500	* updated to match the one sent by the master. Any messages sent back
				1501	* from the master before the MIGRATING flag will bring the lock properly
				1502	* up-to-date, and the change will be ordered properly for the waiter.
				1503	* We will not attempt to modify the lock underneath the waiter.
				1504	*/
				1505
				1506	static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
				1507	struct dlm_lock_resource *res,
				1508	struct dlm_migratable_lockres *mres)
				1509	{
				1510	struct dlm_migratable_lock *ml;
				1511	struct list_head *queue;
				1512	struct dlm_lock *newlock = NULL;
				1513	struct dlm_lockstatus *lksb = NULL;
				1514	int ret = 0;
Kurt Hackel	c3187ce	2006-04-27 18:05:41 -0700	[diff] [blame^]	1515	int i, bad;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1516	struct list_head *iter;
				1517	struct dlm_lock *lock = NULL;
				1518
				1519	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
				1520	for (i=0; i<mres->num_locks; i++) {
				1521	ml = &(mres->ml[i]);
				1522	BUG_ON(ml->highest_blocked != LKM_IVMODE);
				1523	newlock = NULL;
				1524	lksb = NULL;
				1525
				1526	queue = dlm_list_num_to_pointer(res, ml->list);
				1527
				1528	/* if the lock is for the local node it needs to
				1529	* be moved to the proper location within the queue.
				1530	* do not allocate a new lock structure. */
				1531	if (ml->node == dlm->node_num) {
				1532	/* MIGRATION ONLY! */
				1533	BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
				1534
				1535	spin_lock(&res->spinlock);
				1536	list_for_each(iter, queue) {
				1537	lock = list_entry (iter, struct dlm_lock, list);
				1538	if (lock->ml.cookie != ml->cookie)
				1539	lock = NULL;
				1540	else
				1541	break;
				1542	}
				1543
				1544	/* lock is always created locally first, and
				1545	* destroyed locally last. it must be on the list */
				1546	if (!lock) {
Kurt Hackel	2900485	2006-03-02 16:43:36 -0800	[diff] [blame]	1547	u64 c = ml->cookie;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1548	mlog(ML_ERROR, "could not find local lock "
Kurt Hackel	2900485	2006-03-02 16:43:36 -0800	[diff] [blame]	1549	"with cookie %u:%llu!\n",
				1550	dlm_get_lock_cookie_node(c),
				1551	dlm_get_lock_cookie_seq(c));
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1552	BUG();
				1553	}
				1554	BUG_ON(lock->ml.node != ml->node);
				1555
				1556	/* see NOTE above about why we do not update
				1557	* to match the master here */
				1558
				1559	/* move the lock to its proper place */
				1560	/* do not alter lock refcount. switching lists. */
Akinobu Mita	f116629	2006-06-26 00:24:46 -0700	[diff] [blame]	1561	list_move_tail(&lock->list, queue);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1562	spin_unlock(&res->spinlock);
				1563
				1564	mlog(0, "just reordered a local lock!\n");
				1565	continue;
				1566	}
				1567
				1568	/* lock is for another node. */
				1569	newlock = dlm_new_lock(ml->type, ml->node,
				1570	be64_to_cpu(ml->cookie), NULL);
				1571	if (!newlock) {
				1572	ret = -ENOMEM;
				1573	goto leave;
				1574	}
				1575	lksb = newlock->lksb;
				1576	dlm_lock_attach_lockres(newlock, res);
				1577
				1578	if (ml->convert_type != LKM_IVMODE) {
				1579	BUG_ON(queue != &res->converting);
				1580	newlock->ml.convert_type = ml->convert_type;
				1581	}
				1582	lksb->flags \|= (ml->flags &
				1583	(DLM_LKSB_PUT_LVB\|DLM_LKSB_GET_LVB));
				1584
Kurt Hackel	8bc674c	2006-04-27 18:02:10 -0700	[diff] [blame]	1585	if (!dlm_lvb_is_empty(mres->lvb)) {
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1586	if (lksb->flags & DLM_LKSB_PUT_LVB) {
				1587	/* other node was trying to update
				1588	* lvb when node died. recreate the
				1589	* lksb with the updated lvb. */
				1590	memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
				1591	} else {
				1592	/* otherwise, the node is sending its
				1593	* most recent valid lvb info */
				1594	BUG_ON(ml->type != LKM_EXMODE &&
				1595	ml->type != LKM_PRMODE);
Kurt Hackel	8bc674c	2006-04-27 18:02:10 -0700	[diff] [blame]	1596	if (!dlm_lvb_is_empty(res->lvb) &&
				1597	(ml->type == LKM_EXMODE \|\|
				1598	memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1599	mlog(ML_ERROR, "received bad lvb!\n");
				1600	__dlm_print_one_lock_resource(res);
				1601	BUG();
				1602	}
				1603	memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
				1604	}
				1605	}
				1606
				1607
				1608	/* NOTE:
				1609	* wrt lock queue ordering and recovery:
				1610	* 1. order of locks on granted queue is
				1611	* meaningless.
				1612	* 2. order of locks on converting queue is
				1613	* LOST with the node death. sorry charlie.
				1614	* 3. order of locks on the blocked queue is
				1615	* also LOST.
				1616	* order of locks does not affect integrity, it
				1617	* just means that a lock request may get pushed
				1618	* back in line as a result of the node death.
				1619	* also note that for a given node the lock order
				1620	* for its secondary queue locks is preserved
				1621	* relative to each other, but clearly not
				1622	* preserved relative to locks from other nodes.
				1623	*/
Kurt Hackel	c3187ce	2006-04-27 18:05:41 -0700	[diff] [blame^]	1624	bad = 0;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1625	spin_lock(&res->spinlock);
Kurt Hackel	c3187ce	2006-04-27 18:05:41 -0700	[diff] [blame^]	1626	list_for_each_entry(lock, queue, list) {
				1627	if (lock->ml.cookie == ml->cookie) {
				1628	u64 c = lock->ml.cookie;
				1629	mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
				1630	"exists on this lockres!\n", dlm->name,
				1631	res->lockname.len, res->lockname.name,
				1632	dlm_get_lock_cookie_node(c),
				1633	dlm_get_lock_cookie_seq(c));
				1634
				1635	mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
				1636	"node=%u, cookie=%u:%llu, queue=%d\n",
				1637	ml->type, ml->convert_type, ml->node,
				1638	dlm_get_lock_cookie_node(ml->cookie),
				1639	dlm_get_lock_cookie_seq(ml->cookie),
				1640	ml->list);
				1641
				1642	__dlm_print_one_lock_resource(res);
				1643	bad = 1;
				1644	break;
				1645	}
				1646	}
				1647	if (!bad) {
				1648	dlm_lock_get(newlock);
				1649	list_add_tail(&newlock->list, queue);
				1650	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1651	spin_unlock(&res->spinlock);
				1652	}
				1653	mlog(0, "done running all the locks\n");
				1654
				1655	leave:
				1656	if (ret < 0) {
				1657	mlog_errno(ret);
				1658	if (newlock)
				1659	dlm_lock_put(newlock);
				1660	}
				1661
				1662	mlog_exit(ret);
				1663	return ret;
				1664	}
				1665
				1666	void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
				1667	struct dlm_lock_resource *res)
				1668	{
				1669	int i;
				1670	struct list_head queue, iter, *iter2;
				1671	struct dlm_lock *lock;
				1672
				1673	res->state \|= DLM_LOCK_RES_RECOVERING;
				1674	if (!list_empty(&res->recovering))
				1675	list_del_init(&res->recovering);
				1676	list_add_tail(&res->recovering, &dlm->reco.resources);
				1677
				1678	/* find any pending locks and put them back on proper list */
				1679	for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
				1680	queue = dlm_list_idx_to_ptr(res, i);
				1681	list_for_each_safe(iter, iter2, queue) {
				1682	lock = list_entry (iter, struct dlm_lock, list);
				1683	dlm_lock_get(lock);
				1684	if (lock->convert_pending) {
				1685	/* move converting lock back to granted */
				1686	BUG_ON(i != DLM_CONVERTING_LIST);
				1687	mlog(0, "node died with convert pending "
				1688	"on %.*s. move back to granted list.\n",
				1689	res->lockname.len, res->lockname.name);
				1690	dlm_revert_pending_convert(res, lock);
				1691	lock->convert_pending = 0;
				1692	} else if (lock->lock_pending) {
				1693	/* remove pending lock requests completely */
				1694	BUG_ON(i != DLM_BLOCKED_LIST);
				1695	mlog(0, "node died with lock pending "
				1696	"on %.*s. remove from blocked list and skip.\n",
				1697	res->lockname.len, res->lockname.name);
				1698	/* lock will be floating until ref in
				1699	* dlmlock_remote is freed after the network
				1700	* call returns. ok for it to not be on any
				1701	* list since no ast can be called
				1702	* (the master is dead). */
				1703	dlm_revert_pending_lock(res, lock);
				1704	lock->lock_pending = 0;
				1705	} else if (lock->unlock_pending) {
				1706	/* if an unlock was in progress, treat as
				1707	* if this had completed successfully
				1708	* before sending this lock state to the
				1709	* new master. note that the dlm_unlock
				1710	* call is still responsible for calling
				1711	* the unlockast. that will happen after
				1712	* the network call times out. for now,
				1713	* just move lists to prepare the new
				1714	* recovery master. */
				1715	BUG_ON(i != DLM_GRANTED_LIST);
				1716	mlog(0, "node died with unlock pending "
				1717	"on %.*s. remove from blocked list and skip.\n",
				1718	res->lockname.len, res->lockname.name);
				1719	dlm_commit_pending_unlock(res, lock);
				1720	lock->unlock_pending = 0;
				1721	} else if (lock->cancel_pending) {
				1722	/* if a cancel was in progress, treat as
				1723	* if this had completed successfully
				1724	* before sending this lock state to the
				1725	* new master */
				1726	BUG_ON(i != DLM_CONVERTING_LIST);
				1727	mlog(0, "node died with cancel pending "
				1728	"on %.*s. move back to granted list.\n",
				1729	res->lockname.len, res->lockname.name);
				1730	dlm_commit_pending_cancel(res, lock);
				1731	lock->cancel_pending = 0;
				1732	}
				1733	dlm_lock_put(lock);
				1734	}
				1735	}
				1736	}
				1737
				1738
				1739
				1740	/* removes all recovered locks from the recovery list.
				1741	* sets the res->owner to the new master.
				1742	* unsets the RECOVERY flag and wakes waiters. */
					/* Requires dlm->spinlock (asserted below).  Two passes: first
					 * the dlm->reco.resources list, then every hash bucket to
					 * catch lockreses still flagged RECOVERING.  iter/iter2 are
					 * list cursors and therefore pointers. */
				1743	static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
				1744	u8 dead_node, u8 new_master)
				1745	{
				1746	int i;
Mark Fasheh	81f2094	2006-02-28 17:31:22 -0800	[diff] [blame]	1747	struct list_head *iter, *iter2;
				1748	struct hlist_node *hash_iter;
				1749	struct hlist_head *bucket;
				1750
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1751	struct dlm_lock_resource *res;
				1752
				1753	mlog_entry_void();
				1754
				1755	assert_spin_locked(&dlm->spinlock);
				1756
					/* pass 1: lockreses on the recovery list owned by dead_node */
				1757	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
				1758	res = list_entry (iter, struct dlm_lock_resource, recovering);
				1759	if (res->owner == dead_node) {
				1760	list_del_init(&res->recovering);
				1761	spin_lock(&res->spinlock);
				1762	dlm_change_lockres_owner(dlm, res, new_master);
				1763	res->state &= ~DLM_LOCK_RES_RECOVERING;
				1764	__dlm_dirty_lockres(dlm, res);
				1765	spin_unlock(&res->spinlock);
				1766	wake_up(&res->wq);
				1767	}
				1768	}
				1769
				1770	/* this will become unnecessary eventually, but
				1771	* for now we need to run the whole hash, clear
				1772	* the RECOVERING state and set the owner
				1773	* if necessary */
Mark Fasheh	81f2094	2006-02-28 17:31:22 -0800	[diff] [blame]	1774	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
Daniel Phillips	03d864c	2006-03-10 18:08:16 -0800	[diff] [blame]	1775	bucket = dlm_lockres_hash(dlm, i);
Mark Fasheh	81f2094	2006-02-28 17:31:22 -0800	[diff] [blame]	1776	hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1777	if (res->state & DLM_LOCK_RES_RECOVERING) {
				1778	if (res->owner == dead_node) {
				1779	mlog(0, "(this=%u) res %.*s owner=%u "
				1780	"was not on recovering list, but "
				1781	"clearing state anyway\n",
				1782	dlm->node_num, res->lockname.len,
				1783	res->lockname.name, new_master);
				1784	} else if (res->owner == dlm->node_num) {
				1785	mlog(0, "(this=%u) res %.*s owner=%u "
				1786	"was not on recovering list, "
				1787	"owner is THIS node, clearing\n",
				1788	dlm->node_num, res->lockname.len,
				1789	res->lockname.name, new_master);
				1790	} else
					/* owned by some third node: leave it alone */
				1791	continue;
				1792
Kurt Hackel	c03872f	2006-03-06 14:08:49 -0800	[diff] [blame]	1793	if (!list_empty(&res->recovering)) {
				1794	mlog(0, "%s:%.*s: lockres was "
				1795	"marked RECOVERING, owner=%u\n",
				1796	dlm->name, res->lockname.len,
				1797	res->lockname.name, res->owner);
				1798	list_del_init(&res->recovering);
				1799	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1800	spin_lock(&res->spinlock);
				1801	dlm_change_lockres_owner(dlm, res, new_master);
				1802	res->state &= ~DLM_LOCK_RES_RECOVERING;
				1803	__dlm_dirty_lockres(dlm, res);
				1804	spin_unlock(&res->spinlock);
				1805	wake_up(&res->wq);
				1806	}
				1807	}
				1808	}
				1809	}
				1810
					/* Decide whether this lock's mode means the lvb must be blanked.
					 * With local != 0: returns 1 when the lock is neither EX nor PR.
					 * With local == 0: returns 1 when the lock is EX.
					 * Returns 0 otherwise. */
				1811	static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
				1812	{
				1813	if (local) {
				1814	if (lock->ml.type != LKM_EXMODE &&
				1815	lock->ml.type != LKM_PRMODE)
				1816	return 1;
				1817	} else if (lock->ml.type == LKM_EXMODE)
				1818	return 1;
				1819	return 0;
				1820	}
				1821
				1822	static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
				1823	struct dlm_lock_resource *res, u8 dead_node)
				1824	{
				1825	struct list_head iter, queue;
				1826	struct dlm_lock *lock;
				1827	int blank_lvb = 0, local = 0;
				1828	int i;
				1829	u8 search_node;
				1830
				1831	assert_spin_locked(&dlm->spinlock);
				1832	assert_spin_locked(&res->spinlock);
				1833
				1834	if (res->owner == dlm->node_num)
				1835	/* if this node owned the lockres, and if the dead node
				1836	* had an EX when he died, blank out the lvb */
				1837	search_node = dead_node;
				1838	else {
				1839	/* if this is a secondary lockres, and we had no EX or PR
				1840	* locks granted, we can no longer trust the lvb */
				1841	search_node = dlm->node_num;
				1842	local = 1; /* check local state for valid lvb */
				1843	}
				1844
				1845	for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
				1846	queue = dlm_list_idx_to_ptr(res, i);
				1847	list_for_each(iter, queue) {
				1848	lock = list_entry (iter, struct dlm_lock, list);
				1849	if (lock->ml.node == search_node) {
				1850	if (dlm_lvb_needs_invalidation(lock, local)) {
				1851	/* zero the lksb lvb and lockres lvb */
				1852	blank_lvb = 1;
				1853	memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
				1854	}
				1855	}
				1856	}
				1857	}
				1858
				1859	if (blank_lvb) {
				1860	mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
				1861	res->lockname.len, res->lockname.name, dead_node);
				1862	memset(res->lvb, 0, DLM_LVB_LEN);
				1863	}
				1864	}
				1865
				1866	static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
				1867	struct dlm_lock_resource *res, u8 dead_node)
				1868	{
				1869	struct list_head iter, tmpiter;
				1870	struct dlm_lock *lock;
				1871
				1872	/* this node is the lockres master:
				1873	* 1) remove any stale locks for the dead node
				1874	* 2) if the dead node had an EX when he died, blank out the lvb
				1875	*/
				1876	assert_spin_locked(&dlm->spinlock);
				1877	assert_spin_locked(&res->spinlock);
				1878
				1879	/* TODO: check pending_asts, pending_basts here */
				1880	list_for_each_safe(iter, tmpiter, &res->granted) {
				1881	lock = list_entry (iter, struct dlm_lock, list);
				1882	if (lock->ml.node == dead_node) {
				1883	list_del_init(&lock->list);
				1884	dlm_lock_put(lock);
				1885	}
				1886	}
				1887	list_for_each_safe(iter, tmpiter, &res->converting) {
				1888	lock = list_entry (iter, struct dlm_lock, list);
				1889	if (lock->ml.node == dead_node) {
				1890	list_del_init(&lock->list);
				1891	dlm_lock_put(lock);
				1892	}
				1893	}
				1894	list_for_each_safe(iter, tmpiter, &res->blocked) {
				1895	lock = list_entry (iter, struct dlm_lock, list);
				1896	if (lock->ml.node == dead_node) {
				1897	list_del_init(&lock->list);
				1898	dlm_lock_put(lock);
				1899	}
				1900	}
				1901
				1902	/* do not kick thread yet */
				1903	__dlm_dirty_lockres(dlm, res);
				1904	}
				1905
				1906	/* if this node is the recovery master, and there are no
				1907	* locks for a given lockres owned by this node that are in
				1908	* either PR or EX mode, zero out the lvb before requesting.
				1909	*
				1910	*/
				1911
				1912
					/* Local cleanup after dead_node died: purge stale mles, then
					 * walk every lockres hash bucket.  $RECOVERY lockreses only
					 * lose the dead node's granted lock; every other lockres gets
					 * its lvb revalidated and is then either moved to the recovery
					 * list (dead node owned it) or stripped of the dead node's
					 * locks (this node owns it).
					 * NOTE(review): the callees assert dlm->spinlock, so the caller
					 * is expected to hold it. */
				1913	static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
				1914	{
Mark Fasheh	81f2094	2006-02-28 17:31:22 -0800	[diff] [blame]	1915	struct hlist_node *iter;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1916	struct dlm_lock_resource *res;
				1917	int i;
Mark Fasheh	81f2094	2006-02-28 17:31:22 -0800	[diff] [blame]	1918	struct hlist_head *bucket;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	1919	struct dlm_lock *lock;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1920
				1921
				1922	/* purge any stale mles */
				1923	dlm_clean_master_list(dlm, dead_node);
				1924
				1925	/*
				1926	* now clean up all lock resources. there are two rules:
				1927	*
				1928	* 1) if the dead node was the master, move the lockres
				1929	* to the recovering list. set the RECOVERING flag.
				1930	* this lockres needs to be cleaned up before it can
				1931	* be used further.
				1932	*
				1933	* 2) if this node was the master, remove all locks from
				1934	* each of the lockres queues that were owned by the
				1935	* dead node. once recovery finishes, the dlm thread
				1936	* can be kicked again to see if any ASTs or BASTs
				1937	* need to be fired as a result.
				1938	*/
Mark Fasheh	81f2094	2006-02-28 17:31:22 -0800	[diff] [blame]	1939	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
Daniel Phillips	03d864c	2006-03-10 18:08:16 -0800	[diff] [blame]	1940	bucket = dlm_lockres_hash(dlm, i);
Mark Fasheh	81f2094	2006-02-28 17:31:22 -0800	[diff] [blame]	1941	hlist_for_each_entry(res, iter, bucket, hash_node) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	1942	/* always prune any $RECOVERY entries for dead nodes,
				1943	* otherwise hangs can occur during later recovery */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1944	if (dlm_is_recovery_lock(res->lockname.name,
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	1945	res->lockname.len)) {
				1946	spin_lock(&res->spinlock);
				1947	list_for_each_entry(lock, &res->granted, list) {
				1948	if (lock->ml.node == dead_node) {
				1949	mlog(0, "AHA! there was "
				1950	"a $RECOVERY lock for dead "
				1951	"node %u (%s)!\n",
				1952	dead_node, dlm->name);
				1953	list_del_init(&lock->list);
				1954	dlm_lock_put(lock);
					/* the non-safe iterator is only ok because we stop
					 * right after unlinking the entry */
				1955	break;
				1956	}
				1957	}
				1958	spin_unlock(&res->spinlock);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1959	continue;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	1960	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	1961	spin_lock(&res->spinlock);
				1962	/* zero the lvb if necessary */
				1963	dlm_revalidate_lvb(dlm, res, dead_node);
				1964	if (res->owner == dead_node)
				1965	dlm_move_lockres_to_recovery_list(dlm, res);
				1966	else if (res->owner == dlm->node_num) {
				1967	dlm_free_dead_locks(dlm, res, dead_node);
				1968	__dlm_lockres_calc_usage(dlm, res);
				1969	}
				1970	spin_unlock(&res->spinlock);
				1971	}
				1972	}
				1973
				1974	}
				1975
					/* Handle node idx going down; dlm->spinlock must be held
					 * (asserted below).  Returns early if idx is already absent
					 * from live_nodes_map or from domain_map.  Otherwise clears
					 * the live bit, resets join state if idx was joining, runs
					 * local recovery cleanup unless idx is already in
					 * recovery_map, notifies heartbeat-event listeners, clears
					 * the domain bit, wakes migration waiters and finally sets
					 * idx in recovery_map. */
				1976	static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
				1977	{
				1978	assert_spin_locked(&dlm->spinlock);
				1979
				1980	/* check to see if the node is already considered dead */
				1981	if (!test_bit(idx, dlm->live_nodes_map)) {
				1982	mlog(0, "for domain %s, node %d is already dead. "
				1983	"another node likely did recovery already.\n",
				1984	dlm->name, idx);
				1985	return;
				1986	}
				1987
				1988	/* check to see if we do not care about this node */
				1989	if (!test_bit(idx, dlm->domain_map)) {
				1990	/* This also catches the case that we get a node down
				1991	* but haven't joined the domain yet. */
				1992	mlog(0, "node %u already removed from domain!\n", idx);
				1993	return;
				1994	}
				1995
				1996	clear_bit(idx, dlm->live_nodes_map);
				1997
				1998	/* Clean up join state on node death. */
				1999	if (dlm->joining_node == idx) {
				2000	mlog(0, "Clearing join state for node %u\n", idx);
				2001	__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
				2002	}
				2003
				2004	/* make sure local cleanup occurs before the heartbeat events */
				2005	if (!test_bit(idx, dlm->recovery_map))
				2006	dlm_do_local_recovery_cleanup(dlm, idx);
				2007
				2008	/* notify anything attached to the heartbeat events */
				2009	dlm_hb_event_notify_attached(dlm, idx, 0);
				2010
				2011	mlog(0, "node %u being removed from domain map!\n", idx);
				2012	clear_bit(idx, dlm->domain_map);
				2013	/* wake up migration waiters if a node goes down.
				2014	* perhaps later we can genericize this for other waiters. */
				2015	wake_up(&dlm->migration_wq);
				2016
				2017	if (test_bit(idx, dlm->recovery_map))
				2018	mlog(0, "domain %s, node %u already added "
				2019	"to recovery map!\n", dlm->name, idx);
				2020	else
				2021	set_bit(idx, dlm->recovery_map);
				2022	}
				2023
				2024	void dlm_hb_node_down_cb(struct o2nm_node node, int idx, void data)
				2025	{
				2026	struct dlm_ctxt *dlm = data;
				2027
				2028	if (!dlm_grab(dlm))
				2029	return;
				2030
				2031	spin_lock(&dlm->spinlock);
				2032	__dlm_hb_node_down(dlm, idx);
				2033	spin_unlock(&dlm->spinlock);
				2034
				2035	dlm_put(dlm);
				2036	}
				2037
				2038	void dlm_hb_node_up_cb(struct o2nm_node node, int idx, void data)
				2039	{
				2040	struct dlm_ctxt *dlm = data;
				2041
				2042	if (!dlm_grab(dlm))
				2043	return;
				2044
				2045	spin_lock(&dlm->spinlock);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2046	set_bit(idx, dlm->live_nodes_map);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2047	/* do NOT notify mle attached to the heartbeat events.
				2048	* new nodes are not interesting in mastery until joined. */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2049	spin_unlock(&dlm->spinlock);
				2050
				2051	dlm_put(dlm);
				2052	}
				2053
					/* ast callback passed to dlmlock() for the $RECOVERY lock;
					 * astdata is the struct dlm_ctxt.  Only logs. */
				2054	static void dlm_reco_ast(void *astdata)
				2055	{
				2056	struct dlm_ctxt *dlm = astdata;
				2057	mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n",
				2058	dlm->node_num, dlm->name);
				2059	}
					/* bast callback passed to dlmlock() for the $RECOVERY lock;
					 * astdata is the struct dlm_ctxt.  Only logs; blocked_type is
					 * ignored. */
				2060	static void dlm_reco_bast(void *astdata, int blocked_type)
				2061	{
				2062	struct dlm_ctxt *dlm = astdata;
				2063	mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n",
				2064	dlm->node_num, dlm->name);
				2065	}
					/* unlock-ast callback passed to dlmunlock() for the $RECOVERY
					 * lock.  Only logs; both arguments are ignored. */
				2066	static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
				2067	{
				2068	mlog(0, "unlockast for recovery lock fired!\n");
				2069	}
				2070
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2071	/*
				2072	* dlm_pick_recovery_master will continually attempt to use
				2073	* dlmlock() on the special "$RECOVERY" lockres with the
				2074	* LKM_NOQUEUE flag to get an EX. every thread that enters
				2075	* this function on each node racing to become the recovery
				2076	* master will not stop attempting this until either:
				2077	* a) this node gets the EX (and becomes the recovery master),
				2078	* or b) dlm->reco.new_master gets set to some nodenum
				2079	* != O2NM_INVALID_NODE_NUM (another node will do the reco).
				2080	* so each time a recovery master is needed, the entire cluster
				2081	* will sync at this point. if the new master dies, that will
				2082	* be detected in dlm_do_recovery */
					/* Returns 0 when this node became the recovery master and
					 * sent the begin-reco messages, -EEXIST when another node
					 * is (or became) the master, -EINVAL when the EX was
					 * obtained but no dead node is set any more.  Any dlmlock()
					 * result other than DLM_NORMAL or DLM_NOTQUEUED hits BUG(). */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2083	static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
				2084	{
				2085	enum dlm_status ret;
				2086	struct dlm_lockstatus lksb;
				2087	int status = -EINVAL;
				2088
				2089	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
				2090	dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2091	again:
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2092	memset(&lksb, 0, sizeof(lksb));
				2093
					/* the two flags are combined with a bitwise OR */
				2094	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
				2095	DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
				2096
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2097	mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
				2098	dlm->name, ret, lksb.status);
				2099
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2100	if (ret == DLM_NORMAL) {
				2101	mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
				2102	dlm->name, dlm->node_num);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2103
				2104	/* got the EX lock. check to see if another node
				2105	* just became the reco master */
				2106	if (dlm_reco_master_ready(dlm)) {
				2107	mlog(0, "%s: got reco EX lock, but %u will "
				2108	"do the recovery\n", dlm->name,
				2109	dlm->reco.new_master);
				2110	status = -EEXIST;
				2111	} else {
Kurt Hackel	898effa	2006-01-18 17:01:25 -0800	[diff] [blame]	2112	status = 0;
				2113
				2114	/* see if recovery was already finished elsewhere */
				2115	spin_lock(&dlm->spinlock);
				2116	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
				2117	status = -EINVAL;
				2118	mlog(0, "%s: got reco EX lock, but "
				2119	"node got recovered already\n", dlm->name);
				2120	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
				2121	mlog(ML_ERROR, "%s: new master is %u "
				2122	"but no dead node!\n",
				2123	dlm->name, dlm->reco.new_master);
				2124	BUG();
				2125	}
				2126	}
				2127	spin_unlock(&dlm->spinlock);
				2128	}
				2129
				2130	/* if this node has actually become the recovery master,
				2131	* set the master and send the messages to begin recovery */
				2132	if (!status) {
				2133	mlog(0, "%s: dead=%u, this=%u, sending "
				2134	"begin_reco now\n", dlm->name,
				2135	dlm->reco.dead_node, dlm->node_num);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2136	status = dlm_send_begin_reco_message(dlm,
				2137	dlm->reco.dead_node);
				2138	/* this always succeeds */
				2139	BUG_ON(status);
				2140
				2141	/* set the new_master to this node */
				2142	spin_lock(&dlm->spinlock);
Kurt Hackel	ab27eb6	2006-04-27 18:03:49 -0700	[diff] [blame]	2143	dlm_set_reco_master(dlm, dlm->node_num);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2144	spin_unlock(&dlm->spinlock);
				2145	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2146
				2147	/* recovery lock is a special case. ast will not get fired,
				2148	* so just go ahead and unlock it. */
				2149	ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2150	if (ret == DLM_DENIED) {
				2151	mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
				2152	ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
				2153	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2154	if (ret != DLM_NORMAL) {
				2155	/* this would really suck. this could only happen
				2156	* if there was a network error during the unlock
				2157	* because of node death. this means the unlock
				2158	* is actually "done" and the lock structure is
				2159	* even freed. we can continue, but only
				2160	* because this specific lock name is special. */
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2161	mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2162	}
				2163	} else if (ret == DLM_NOTQUEUED) {
				2164	mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
				2165	dlm->name, dlm->node_num);
				2166	/* another node is master. wait on
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2167	* reco.new_master != O2NM_INVALID_NODE_NUM
				2168	* for at most one second */
				2169	wait_event_timeout(dlm->dlm_reco_thread_wq,
				2170	dlm_reco_master_ready(dlm),
				2171	msecs_to_jiffies(1000));
				2172	if (!dlm_reco_master_ready(dlm)) {
				2173	mlog(0, "%s: reco master taking awhile\n",
				2174	dlm->name);
				2175	goto again;
				2176	}
				2177	/* another node has informed this one that it is reco master */
				2178	mlog(0, "%s: reco master %u is ready to recover %u\n",
				2179	dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2180	status = -EEXIST;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2181	} else {
				2182	struct dlm_lock_resource *res;
				2183
				2184	/* dlmlock returned something other than NOTQUEUED or NORMAL */
				2185	mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
				2186	"lksb.status=%s\n", dlm->name, dlm_errname(ret),
				2187	dlm_errname(lksb.status));
				2188	res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
				2189	DLM_RECOVERY_LOCK_NAME_LEN);
				2190	if (res) {
				2191	dlm_print_one_lock_resource(res);
				2192	dlm_lockres_put(res);
				2193	} else {
				2194	mlog(ML_ERROR, "recovery lock not found\n");
				2195	}
				2196	BUG();
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2197	}
				2198
				2199	return status;
				2200	}
				2201
					/* Send DLM_BEGIN_RECO_MSG, naming this node as sender and
					 * dead_node as the node to recover, to every node in a
					 * snapshot of dlm->domain_map except dead_node and this node.
					 * A host-down result is treated as success.  Any other
					 * negative result is logged, followed by a 100ms sleep and a
					 * retry of the same node, so a negative value never leaves
					 * the loop.  Returns the result of the last iteration. */
				2202	static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
				2203	{
				2204	struct dlm_begin_reco br;
				2205	int ret = 0;
				2206	struct dlm_node_iter iter;
				2207	int nodenum;
				2208	int status;
				2209
				2210	mlog_entry("%u\n", dead_node);
				2211
				2212	mlog(0, "dead node is %u\n", dead_node);
				2213
					/* copy the domain map under the lock, iterate the copy unlocked */
				2214	spin_lock(&dlm->spinlock);
				2215	dlm_node_iter_init(dlm->domain_map, &iter);
				2216	spin_unlock(&dlm->spinlock);
				2217
				2218	clear_bit(dead_node, iter.node_map);
				2219
				2220	memset(&br, 0, sizeof(br));
				2221	br.node_idx = dlm->node_num;
				2222	br.dead_node = dead_node;
				2223
				2224	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
				2225	ret = 0;
				2226	if (nodenum == dead_node) {
				2227	mlog(0, "not sending begin reco to dead node "
				2228	"%u\n", dead_node);
				2229	continue;
				2230	}
				2231	if (nodenum == dlm->node_num) {
				2232	mlog(0, "not sending begin reco to self\n");
				2233	continue;
				2234	}
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2235	retry:
					/* NOTE(review): this -EINVAL is overwritten by the send
					 * below before it is ever read */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2236	ret = -EINVAL;
				2237	mlog(0, "attempting to send begin reco msg to %d\n",
				2238	nodenum);
				2239	ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
				2240	&br, sizeof(br), nodenum, &status);
				2241	/* negative status is handled ok by caller here */
				2242	if (ret >= 0)
				2243	ret = status;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2244	if (dlm_is_host_down(ret)) {
				2245	/* node is down. not involved in recovery
				2246	* so just keep going */
				2247	mlog(0, "%s: node %u was down when sending "
				2248	"begin reco msg (%d)\n", dlm->name, nodenum, ret);
				2249	ret = 0;
				2250	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2251	if (ret < 0) {
				2252	struct dlm_lock_resource *res;
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2253	/* this is now a serious problem, possibly ENOMEM
				2254	* in the network stack. must retry */
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2255	mlog_errno(ret);
				2256	mlog(ML_ERROR, "begin reco of dlm %s to node %u "
				2257	" returned %d\n", dlm->name, nodenum, ret);
				2258	res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
				2259	DLM_RECOVERY_LOCK_NAME_LEN);
				2260	if (res) {
				2261	dlm_print_one_lock_resource(res);
				2262	dlm_lockres_put(res);
				2263	} else {
				2264	mlog(ML_ERROR, "recovery lock not found\n");
				2265	}
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2266	/* sleep for a bit in hopes that we can avoid
				2267	* another ENOMEM */
				2268	msleep(100);
				2269	goto retry;
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2270	}
				2271	}
				2272
				2273	return ret;
				2274	}
				2275
				2276	int dlm_begin_reco_handler(struct o2net_msg msg, u32 len, void data)
				2277	{
				2278	struct dlm_ctxt *dlm = data;
				2279	struct dlm_begin_reco br = (struct dlm_begin_reco )msg->buf;
				2280
				2281	/* ok to return 0, domain has gone away */
				2282	if (!dlm_grab(dlm))
				2283	return 0;
				2284
				2285	mlog(0, "node %u wants to recover node %u\n",
				2286	br->node_idx, br->dead_node);
				2287
				2288	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
				2289
				2290	spin_lock(&dlm->spinlock);
				2291	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2292	if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
				2293	mlog(0, "%s: new_master %u died, changing "
				2294	"to %u\n", dlm->name, dlm->reco.new_master,
				2295	br->node_idx);
				2296	} else {
				2297	mlog(0, "%s: new_master %u NOT DEAD, changing "
				2298	"to %u\n", dlm->name, dlm->reco.new_master,
				2299	br->node_idx);
				2300	/* may not have seen the new master as dead yet */
				2301	}
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2302	}
				2303	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2304	mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
				2305	"node %u changing it to %u\n", dlm->name,
				2306	dlm->reco.dead_node, br->node_idx, br->dead_node);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2307	}
Kurt Hackel	ab27eb6	2006-04-27 18:03:49 -0700	[diff] [blame]	2308	dlm_set_reco_master(dlm, br->node_idx);
				2309	dlm_set_reco_dead_node(dlm, br->dead_node);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2310	if (!test_bit(br->dead_node, dlm->recovery_map)) {
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2311	mlog(0, "recovery master %u sees %u as dead, but this "
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2312	"node has not yet. marking %u as dead\n",
				2313	br->node_idx, br->dead_node, br->dead_node);
Kurt Hackel	e2faea4	2006-01-12 14:24:55 -0800	[diff] [blame]	2314	if (!test_bit(br->dead_node, dlm->domain_map) \|\|
				2315	!test_bit(br->dead_node, dlm->live_nodes_map))
				2316	mlog(0, "%u not in domain/live_nodes map "
				2317	"so setting it in reco map manually\n",
				2318	br->dead_node);
Kurt Hackel	c03872f	2006-03-06 14:08:49 -0800	[diff] [blame]	2319	/* force the recovery cleanup in __dlm_hb_node_down
				2320	* both of these will be cleared in a moment */
				2321	set_bit(br->dead_node, dlm->domain_map);
				2322	set_bit(br->dead_node, dlm->live_nodes_map);
Kurt Hackel	6714d8e	2005-12-15 14:31:23 -0800	[diff] [blame]	2323	__dlm_hb_node_down(dlm, br->dead_node);
				2324	}
				2325	spin_unlock(&dlm->spinlock);
				2326
				2327	dlm_kick_recovery_thread(dlm);
				2328	dlm_put(dlm);
				2329	return 0;
				2330	}
				2331
					/* Send DLM_FINALIZE_RECO_MSG, naming this node as sender and
					 * dlm->reco.dead_node as the recovered node, to every node in
					 * a snapshot of dlm->domain_map except this node.  A host-down
					 * status returned by the peer is logged and treated as
					 * success.  Any other negative result stops the loop and is
					 * returned.  Returns the result of the last send otherwise. */
				2332	static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
				2333	{
				2334	int ret = 0;
				2335	struct dlm_finalize_reco fr;
				2336	struct dlm_node_iter iter;
				2337	int nodenum;
				2338	int status;
				2339
				2340	mlog(0, "finishing recovery for node %s:%u\n",
				2341	dlm->name, dlm->reco.dead_node);
				2342
					/* copy the domain map under the lock, iterate the copy unlocked */
				2343	spin_lock(&dlm->spinlock);
				2344	dlm_node_iter_init(dlm->domain_map, &iter);
				2345	spin_unlock(&dlm->spinlock);
				2346
				2347	memset(&fr, 0, sizeof(fr));
				2348	fr.node_idx = dlm->node_num;
				2349	fr.dead_node = dlm->reco.dead_node;
				2350
				2351	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
				2352	if (nodenum == dlm->node_num)
				2353	continue;
				2354	ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
				2355	&fr, sizeof(fr), nodenum, &status);
					/* the host-down check applies only to the peer's status,
					 * i.e. when the send itself did not fail */
				2356	if (ret >= 0) {
				2357	ret = status;
				2358	if (dlm_is_host_down(ret)) {
				2359	/* this has no effect on this recovery
				2360	* session, so set the status to zero to
				2361	* finish out the last recovery */
				2362	mlog(ML_ERROR, "node %u went down after this "
				2363	"node finished recovery.\n", nodenum);
				2364	ret = 0;
				2365	}
				2366	}
				2367	if (ret < 0) {
				2368	mlog_errno(ret);
				2369	break;
				2370	}
				2371	}
				2372
				2373	return ret;
				2374	}
				2375
				2376	int dlm_finalize_reco_handler(struct o2net_msg msg, u32 len, void data)
				2377	{
				2378	struct dlm_ctxt *dlm = data;
				2379	struct dlm_finalize_reco fr = (struct dlm_finalize_reco )msg->buf;
				2380
				2381	/* ok to return 0, domain has gone away */
				2382	if (!dlm_grab(dlm))
				2383	return 0;
				2384
				2385	mlog(0, "node %u finalizing recovery of node %u\n",
				2386	fr->node_idx, fr->dead_node);
				2387
				2388	spin_lock(&dlm->spinlock);
				2389
				2390	if (dlm->reco.new_master != fr->node_idx) {
				2391	mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
				2392	"%u is supposed to be the new master, dead=%u\n",
				2393	fr->node_idx, dlm->reco.new_master, fr->dead_node);
				2394	BUG();
				2395	}
				2396	if (dlm->reco.dead_node != fr->dead_node) {
				2397	mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
				2398	"node %u, but node %u is supposed to be dead\n",
				2399	fr->node_idx, fr->dead_node, dlm->reco.dead_node);
				2400	BUG();
				2401	}
				2402
				2403	dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
				2404
				2405	spin_unlock(&dlm->spinlock);
				2406
				2407	dlm_reset_recovery(dlm);
				2408
				2409	dlm_kick_recovery_thread(dlm);
				2410	dlm_put(dlm);
				2411	return 0;
				2412	}