Blame - fs/ocfs2/cluster/quorum.c - kernel/msm-4.9

blob: c19897d0fe142a5a9926da9552572daf78982938 [file] [log] [blame]

Zach Brown	9821148	2005-12-15 14:31:23 -0800	[diff] [blame]	1	/* -- mode: c; c-basic-offset: 8; --
				2	*
				3	* vim: noexpandtab sw=8 ts=8 sts=0:
				4	*
				5	* Copyright (C) 2005 Oracle. All rights reserved.
				6	*
				7	* This program is free software; you can redistribute it and/or
				8	* modify it under the terms of the GNU General Public
				9	* License as published by the Free Software Foundation; either
				10	* version 2 of the License, or (at your option) any later version.
				11	*
				12	* This program is distributed in the hope that it will be useful,
				13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				15	* General Public License for more details.
				16	*
				17	* You should have received a copy of the GNU General Public
				18	* License along with this program; if not, write to the
				19	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				20	* Boston, MA 021110-1307, USA.
				21	*/
				22
				23	/* This quorum hack is only here until we transition to some more rational
				24	* approach that is driven from userspace. Honest. No foolin'.
				25	*
				26	* Imagine two nodes lose network connectivity to each other but they're still
				27	* up and operating in every other way. Presumably a network timeout indicates
				28	* that a node is broken and should be recovered. They can't both recover each
				29	* other and both carry on without serialising their access to the file system.
				30	* They need to decide who is authoritative. Now extend that problem to
				31	* arbitrary groups of nodes losing connectivity between each other.
				32	*
				33	* So we declare that a node which has given up on connecting to a majority
				34	* of nodes who are still heartbeating will fence itself.
				35	*
				36	* There are huge opportunities for races here. After we give up on a node's
				37	* connection we need to wait long enough to give heartbeat an opportunity
				38	* to declare the node as truly dead. We also need to be careful with the
				39	* race between when we see a node start heartbeating and when we connect
				40	* to it.
				41	*
				42	* So nodes that are in this transition put a hold on the quorum decision
				43	* with a counter. As they fall out of this transition they drop the count
				44	* and if they're the last, they fire off the decision.
				45	*/
				46	#include <linux/kernel.h>
Zach Brown	9821148	2005-12-15 14:31:23 -0800	[diff] [blame]	47	#include <linux/workqueue.h>
Sunil Mushran	bebe6f1	2007-04-17 13:53:38 -0700	[diff] [blame]	48	#include <linux/reboot.h>
Zach Brown	9821148	2005-12-15 14:31:23 -0800	[diff] [blame]	49
				50	#include "heartbeat.h"
				51	#include "nodemanager.h"
				52	#define MLOG_MASK_PREFIX ML_QUORUM
				53	#include "masklog.h"
				54	#include "quorum.h"
				55
				56	static struct o2quo_state {
				57	spinlock_t qs_lock;
				58	struct work_struct qs_work;
				59	int qs_pending;
				60	int qs_heartbeating;
				61	unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
				62	int qs_connected;
				63	unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
				64	int qs_holds;
				65	unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
				66	} o2quo_state;
				67
				68	/* this is horribly heavy-handed. It should instead flip the file
				69	* system RO and call some userspace script. */
				70	static void o2quo_fence_self(void)
				71	{
				72	/* panic spins with interrupts enabled. with preempt
				73	* threads can still schedule, etc, etc */
				74	o2hb_stop_all_regions();
Sunil Mushran	bebe6f1	2007-04-17 13:53:38 -0700	[diff] [blame]	75
Sunil Mushran	f6656d2	2009-11-17 16:29:19 -0800	[diff] [blame]	76	switch (o2nm_single_cluster->cl_fence_method) {
				77	case O2NM_FENCE_PANIC:
				78	panic("*** ocfs2 is very sorry to be fencing this system by "
				79	"panicing ***\n");
				80	break;
				81	default:
				82	WARN_ON(o2nm_single_cluster->cl_fence_method >=
				83	O2NM_FENCE_METHODS);
				84	case O2NM_FENCE_RESET:
				85	printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
				86	"system by restarting ***\n");
				87	emergency_restart();
				88	break;
				89	};
Zach Brown	9821148	2005-12-15 14:31:23 -0800	[diff] [blame]	90	}
				91
/*
 * Called to report that a write to a heartbeat region timed out.
 *
 * Once that happens the other nodes in the cluster may consider us dead,
 * so we "fence" ourselves to avoid scribbling on the disk after they
 * think they have recovered us.  This can't solve every problem related
 * to writeout after recovery, but the hack at least closes some of those
 * gaps.  With real fencing this could go away, since our node would be
 * fenced externally before the other nodes begin recovery.
 */
void o2quo_disk_timeout(void)
{
	o2quo_fence_self();
}
				104
David Howells	c402895	2006-11-22 14:57:56 +0000	[diff] [blame]	105	static void o2quo_make_decision(struct work_struct *work)
Zach Brown	9821148	2005-12-15 14:31:23 -0800	[diff] [blame]	106	{
				107	int quorum;
				108	int lowest_hb, lowest_reachable = 0, fence = 0;
				109	struct o2quo_state *qs = &o2quo_state;
				110
				111	spin_lock(&qs->qs_lock);
				112
				113	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
				114	if (lowest_hb != O2NM_MAX_NODES)
				115	lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
				116
				117	mlog(0, "heartbeating: %d, connected: %d, "
				118	"lowest: %d (%sreachable)\n", qs->qs_heartbeating,
				119	qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
				120
				121	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) \|\|
				122	qs->qs_heartbeating == 1)
				123	goto out;
				124
				125	if (qs->qs_heartbeating & 1) {
				126	/* the odd numbered cluster case is straight forward --
				127	* if we can't talk to the majority we're hosed */
				128	quorum = (qs->qs_heartbeating + 1)/2;
				129	if (qs->qs_connected < quorum) {
				130	mlog(ML_ERROR, "fencing this node because it is "
				131	"only connected to %u nodes and %u is needed "
				132	"to make a quorum out of %u heartbeating nodes\n",
				133	qs->qs_connected, quorum,
				134	qs->qs_heartbeating);
				135	fence = 1;
				136	}
				137	} else {
				138	/* the even numbered cluster adds the possibility of each half
				139	* of the cluster being able to talk amongst themselves.. in
				140	* that case we're hosed if we can't talk to the group that has
				141	* the lowest numbered node */
				142	quorum = qs->qs_heartbeating / 2;
				143	if (qs->qs_connected < quorum) {
				144	mlog(ML_ERROR, "fencing this node because it is "
				145	"only connected to %u nodes and %u is needed "
				146	"to make a quorum out of %u heartbeating nodes\n",
				147	qs->qs_connected, quorum,
				148	qs->qs_heartbeating);
				149	fence = 1;
				150	}
				151	else if ((qs->qs_connected == quorum) &&
				152	!lowest_reachable) {
				153	mlog(ML_ERROR, "fencing this node because it is "
				154	"connected to a half-quorum of %u out of %u "
				155	"nodes which doesn't include the lowest active "
				156	"node %u\n", quorum, qs->qs_heartbeating,
				157	lowest_hb);
				158	fence = 1;
				159	}
				160	}
				161
				162	out:
				163	spin_unlock(&qs->qs_lock);
				164	if (fence)
				165	o2quo_fence_self();
				166	}
				167
				168	static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
				169	{
				170	assert_spin_locked(&qs->qs_lock);
				171
				172	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
				173	qs->qs_holds++;
				174	mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
				175	"node %u\n", node);
				176	mlog(0, "node %u, %d total\n", node, qs->qs_holds);
				177	}
				178	}
				179
				180	static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
				181	{
				182	assert_spin_locked(&qs->qs_lock);
				183
				184	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
				185	mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
				186	if (--qs->qs_holds == 0) {
				187	if (qs->qs_pending) {
				188	qs->qs_pending = 0;
				189	schedule_work(&qs->qs_work);
				190	}
				191	}
				192	mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
				193	node, qs->qs_holds);
				194	}
				195	}
				196
				197	/* as a node comes up we delay the quorum decision until we know the fate of
				198	* the connection. the hold will be droped in conn_up or hb_down. it might be
				199	* perpetuated by con_err until hb_down. if we already have a conn, we might
				200	* be dropping a hold that conn_up got. */
				201	void o2quo_hb_up(u8 node)
				202	{
				203	struct o2quo_state *qs = &o2quo_state;
				204
				205	spin_lock(&qs->qs_lock);
				206
				207	qs->qs_heartbeating++;
				208	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
				209	"node %u\n", node);
				210	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
				211	set_bit(node, qs->qs_hb_bm);
				212
				213	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
				214
				215	if (!test_bit(node, qs->qs_conn_bm))
				216	o2quo_set_hold(qs, node);
				217	else
				218	o2quo_clear_hold(qs, node);
				219
				220	spin_unlock(&qs->qs_lock);
				221	}
				222
				223	/* hb going down releases any holds we might have had due to this node from
				224	* conn_up, conn_err, or hb_up */
				225	void o2quo_hb_down(u8 node)
				226	{
				227	struct o2quo_state *qs = &o2quo_state;
				228
				229	spin_lock(&qs->qs_lock);
				230
				231	qs->qs_heartbeating--;
				232	mlog_bug_on_msg(qs->qs_heartbeating < 0,
				233	"node %u, %d heartbeating\n",
				234	node, qs->qs_heartbeating);
				235	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
				236	clear_bit(node, qs->qs_hb_bm);
				237
				238	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
				239
				240	o2quo_clear_hold(qs, node);
				241
				242	spin_unlock(&qs->qs_lock);
				243	}
				244
				245	/* this tells us that we've decided that the node is still heartbeating
				246	* even though we've lost it's conn. it must only be called after conn_err
				247	* and indicates that we must now make a quorum decision in the future,
				248	* though we might be doing so after waiting for holds to drain. Here
				249	* we'll be dropping the hold from conn_err. */
				250	void o2quo_hb_still_up(u8 node)
				251	{
				252	struct o2quo_state *qs = &o2quo_state;
				253
				254	spin_lock(&qs->qs_lock);
				255
				256	mlog(0, "node %u\n", node);
				257
				258	qs->qs_pending = 1;
				259	o2quo_clear_hold(qs, node);
				260
				261	spin_unlock(&qs->qs_lock);
				262	}
				263
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	264	/* This is analogous to hb_up. as a node's connection comes up we delay the
Zach Brown	9821148	2005-12-15 14:31:23 -0800	[diff] [blame]	265	* quorum decision until we see it heartbeating. the hold will be droped in
				266	* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
				267	* it's already heartbeating we we might be dropping a hold that conn_up got.
				268	* */
				269	void o2quo_conn_up(u8 node)
				270	{
				271	struct o2quo_state *qs = &o2quo_state;
				272
				273	spin_lock(&qs->qs_lock);
				274
				275	qs->qs_connected++;
				276	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
				277	"node %u\n", node);
				278	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
				279	set_bit(node, qs->qs_conn_bm);
				280
				281	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
				282
				283	if (!test_bit(node, qs->qs_hb_bm))
				284	o2quo_set_hold(qs, node);
				285	else
				286	o2quo_clear_hold(qs, node);
				287
				288	spin_unlock(&qs->qs_lock);
				289	}
				290
				291	/* we've decided that we won't ever be connecting to the node again. if it's
				292	* still heartbeating we grab a hold that will delay decisions until either the
				293	* node stops heartbeating from hb_down or the caller decides that the node is
				294	* still up and calls still_up */
				295	void o2quo_conn_err(u8 node)
				296	{
				297	struct o2quo_state *qs = &o2quo_state;
				298
				299	spin_lock(&qs->qs_lock);
				300
				301	if (test_bit(node, qs->qs_conn_bm)) {
				302	qs->qs_connected--;
				303	mlog_bug_on_msg(qs->qs_connected < 0,
				304	"node %u, connected %d\n",
				305	node, qs->qs_connected);
				306
				307	clear_bit(node, qs->qs_conn_bm);
				308	}
				309
				310	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
				311
				312	if (test_bit(node, qs->qs_hb_bm))
				313	o2quo_set_hold(qs, node);
				314
				315	spin_unlock(&qs->qs_lock);
				316	}
				317
				318	void o2quo_init(void)
				319	{
				320	struct o2quo_state *qs = &o2quo_state;
				321
				322	spin_lock_init(&qs->qs_lock);
David Howells	c402895	2006-11-22 14:57:56 +0000	[diff] [blame]	323	INIT_WORK(&qs->qs_work, o2quo_make_decision);
Zach Brown	9821148	2005-12-15 14:31:23 -0800	[diff] [blame]	324	}
				325
				326	void o2quo_exit(void)
				327	{
Tejun Heo	9b00a81	2010-12-24 15:59:06 +0100	[diff] [blame]	328	struct o2quo_state *qs = &o2quo_state;
				329
Tejun Heo	4382973	2012-08-20 14:51:24 -0700	[diff] [blame]	330	flush_work(&qs->qs_work);
Zach Brown	9821148	2005-12-15 14:31:23 -0800	[diff] [blame]	331	}