Blame - fs/ocfs2/super.c - kernel/msm

blob: d17e33e66a1e5228cead65463a3255adb383fa71 [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -- mode: c; c-basic-offset: 8; --
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* super.c
				5	*
				6	* load/unload driver, mount/dismount volumes
				7	*
				8	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				9	*
				10	* This program is free software; you can redistribute it and/or
				11	* modify it under the terms of the GNU General Public
				12	* License as published by the Free Software Foundation; either
				13	* version 2 of the License, or (at your option) any later version.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 021110-1307, USA.
				24	*/
				25
				26	#include <linux/module.h>
				27	#include <linux/fs.h>
				28	#include <linux/types.h>
				29	#include <linux/slab.h>
				30	#include <linux/highmem.h>
				31	#include <linux/utsname.h>
				32	#include <linux/init.h>
				33	#include <linux/random.h>
				34	#include <linux/statfs.h>
				35	#include <linux/moduleparam.h>
				36	#include <linux/blkdev.h>
				37	#include <linux/socket.h>
				38	#include <linux/inet.h>
				39	#include <linux/parser.h>
				40	#include <linux/crc32.h>
				41	#include <linux/debugfs.h>
				42
				43	#include <cluster/nodemanager.h>
				44
				45	#define MLOG_MASK_PREFIX ML_SUPER
				46	#include <cluster/masklog.h>
				47
				48	#include "ocfs2.h"
				49
				50	/* this should be the only file to include a version 1 header */
				51	#include "ocfs1_fs_compat.h"
				52
				53	#include "alloc.h"
				54	#include "dlmglue.h"
				55	#include "export.h"
				56	#include "extent_map.h"
				57	#include "heartbeat.h"
				58	#include "inode.h"
				59	#include "journal.h"
				60	#include "localalloc.h"
				61	#include "namei.h"
				62	#include "slot_map.h"
				63	#include "super.h"
				64	#include "sysfile.h"
				65	#include "uptodate.h"
				66	#include "ver.h"
				67	#include "vote.h"
				68
				69	#include "buffer_head_io.h"
				70
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	71	static kmem_cache_t *ocfs2_inode_cachep = NULL;
				72
				73	kmem_cache_t *ocfs2_lock_cache = NULL;
				74
				75	/* OCFS2 needs to schedule several differnt types of work which
				76	* require cluster locking, disk I/O, recovery waits, etc. Since these
				77	* types of work tend to be heavy we avoid using the kernel events
				78	* workqueue and schedule on our own. */
				79	struct workqueue_struct *ocfs2_wq = NULL;
				80
				81	static struct dentry *ocfs2_debugfs_root = NULL;
				82
				83	MODULE_AUTHOR("Oracle");
				84	MODULE_LICENSE("GPL");
				85
				86	static int ocfs2_parse_options(struct super_block sb, char options,
				87	unsigned long *mount_opt, int is_remount);
				88	static void ocfs2_put_super(struct super_block *sb);
				89	static int ocfs2_mount_volume(struct super_block *sb);
				90	static int ocfs2_remount(struct super_block sb, int flags, char *data);
				91	static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
				92	static int ocfs2_initialize_mem_caches(void);
				93	static void ocfs2_free_mem_caches(void);
				94	static void ocfs2_delete_osb(struct ocfs2_super *osb);
				95
David Howells	726c334	2006-06-23 02:02:58 -0700	[diff] [blame]	96	static int ocfs2_statfs(struct dentry dentry, struct kstatfs buf);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	97
				98	static int ocfs2_sync_fs(struct super_block *sb, int wait);
				99
				100	static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
				101	static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
				102	static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
				103	static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
				104	static int ocfs2_check_volume(struct ocfs2_super *osb);
				105	static int ocfs2_verify_volume(struct ocfs2_dinode *di,
				106	struct buffer_head *bh,
				107	u32 sectsize);
				108	static int ocfs2_initialize_super(struct super_block *sb,
				109	struct buffer_head *bh,
				110	int sector_size);
				111	static int ocfs2_get_sector(struct super_block *sb,
				112	struct buffer_head **bh,
				113	int block,
				114	int sect_size);
				115	static void ocfs2_write_super(struct super_block *sb);
				116	static struct inode ocfs2_alloc_inode(struct super_block sb);
				117	static void ocfs2_destroy_inode(struct inode *inode);
				118
				119	static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
				120
				121	static struct super_operations ocfs2_sops = {
				122	.statfs = ocfs2_statfs,
				123	.alloc_inode = ocfs2_alloc_inode,
				124	.destroy_inode = ocfs2_destroy_inode,
				125	.drop_inode = ocfs2_drop_inode,
				126	.clear_inode = ocfs2_clear_inode,
				127	.delete_inode = ocfs2_delete_inode,
				128	.sync_fs = ocfs2_sync_fs,
				129	.write_super = ocfs2_write_super,
				130	.put_super = ocfs2_put_super,
				131	.remount_fs = ocfs2_remount,
				132	};
				133
				134	enum {
				135	Opt_barrier,
				136	Opt_err_panic,
				137	Opt_err_ro,
				138	Opt_intr,
				139	Opt_nointr,
				140	Opt_hb_none,
				141	Opt_hb_local,
				142	Opt_data_ordered,
				143	Opt_data_writeback,
				144	Opt_err,
				145	};
				146
				147	static match_table_t tokens = {
				148	{Opt_barrier, "barrier=%u"},
				149	{Opt_err_panic, "errors=panic"},
				150	{Opt_err_ro, "errors=remount-ro"},
				151	{Opt_intr, "intr"},
				152	{Opt_nointr, "nointr"},
				153	{Opt_hb_none, OCFS2_HB_NONE},
				154	{Opt_hb_local, OCFS2_HB_LOCAL},
				155	{Opt_data_ordered, "data=ordered"},
				156	{Opt_data_writeback, "data=writeback"},
				157	{Opt_err, NULL}
				158	};
				159
				160	/*
				161	* write_super and sync_fs ripped right out of ext3.
				162	*/
				163	static void ocfs2_write_super(struct super_block *sb)
				164	{
Ingo Molnar	7892f2f	2006-01-09 15:59:25 -0800	[diff] [blame]	165	if (mutex_trylock(&sb->s_lock) != 0)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	166	BUG();
				167	sb->s_dirt = 0;
				168	}
				169
				170	static int ocfs2_sync_fs(struct super_block *sb, int wait)
				171	{
				172	int status = 0;
				173	tid_t target;
				174	struct ocfs2_super *osb = OCFS2_SB(sb);
				175
				176	sb->s_dirt = 0;
				177
				178	if (ocfs2_is_hard_readonly(osb))
				179	return -EROFS;
				180
				181	if (wait) {
				182	status = ocfs2_flush_truncate_log(osb);
				183	if (status < 0)
				184	mlog_errno(status);
				185	} else {
				186	ocfs2_schedule_truncate_log_flush(osb, 0);
				187	}
				188
				189	if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
				190	if (wait)
				191	log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
				192	target);
				193	}
				194	return 0;
				195	}
				196
				197	static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
				198	{
				199	struct inode *new = NULL;
				200	int status = 0;
				201	int i;
				202
				203	mlog_entry_void();
				204
				205	new = ocfs2_iget(osb, osb->root_blkno);
				206	if (IS_ERR(new)) {
				207	status = PTR_ERR(new);
				208	mlog_errno(status);
				209	goto bail;
				210	}
				211	osb->root_inode = new;
				212
				213	new = ocfs2_iget(osb, osb->system_dir_blkno);
				214	if (IS_ERR(new)) {
				215	status = PTR_ERR(new);
				216	mlog_errno(status);
				217	goto bail;
				218	}
				219	osb->sys_root_inode = new;
				220
				221	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
				222	i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
				223	new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
				224	if (!new) {
				225	ocfs2_release_system_inodes(osb);
				226	status = -EINVAL;
				227	mlog_errno(status);
				228	/* FIXME: Should ERROR_RO_FS */
				229	mlog(ML_ERROR, "Unable to load system inode %d, "
				230	"possibly corrupt fs?", i);
				231	goto bail;
				232	}
				233	// the array now has one ref, so drop this one
				234	iput(new);
				235	}
				236
				237	bail:
				238	mlog_exit(status);
				239	return status;
				240	}
				241
				242	static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
				243	{
				244	struct inode *new = NULL;
				245	int status = 0;
				246	int i;
				247
				248	mlog_entry_void();
				249
				250	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
				251	i < NUM_SYSTEM_INODES;
				252	i++) {
				253	new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
				254	if (!new) {
				255	ocfs2_release_system_inodes(osb);
				256	status = -EINVAL;
				257	mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
				258	status, i, osb->slot_num);
				259	goto bail;
				260	}
				261	/* the array now has one ref, so drop this one */
				262	iput(new);
				263	}
				264
				265	bail:
				266	mlog_exit(status);
				267	return status;
				268	}
				269
				270	static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
				271	{
				272	int status = 0, i;
				273	struct inode *inode;
				274
				275	mlog_entry_void();
				276
				277	for (i = 0; i < NUM_SYSTEM_INODES; i++) {
				278	inode = osb->system_inodes[i];
				279	if (inode) {
				280	iput(inode);
				281	osb->system_inodes[i] = NULL;
				282	}
				283	}
				284
				285	inode = osb->sys_root_inode;
				286	if (inode) {
				287	iput(inode);
				288	osb->sys_root_inode = NULL;
				289	}
				290
				291	inode = osb->root_inode;
				292	if (inode) {
				293	iput(inode);
				294	osb->root_inode = NULL;
				295	}
				296
				297	mlog_exit(status);
				298	return status;
				299	}
				300
				301	/* We're allocating fs objects, use GFP_NOFS */
				302	static struct inode ocfs2_alloc_inode(struct super_block sb)
				303	{
				304	struct ocfs2_inode_info *oi;
				305
				306	oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
				307	if (!oi)
				308	return NULL;
				309
				310	return &oi->vfs_inode;
				311	}
				312
				313	static void ocfs2_destroy_inode(struct inode *inode)
				314	{
				315	kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
				316	}
				317
				318	/* From xfs_super.c:xfs_max_file_offset
				319	* Copyright (c) 2000-2004 Silicon Graphics, Inc.
				320	*/
				321	static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
				322	{
				323	unsigned int pagefactor = 1;
				324	unsigned int bitshift = BITS_PER_LONG - 1;
				325
				326	/* Figure out maximum filesize, on Linux this can depend on
				327	* the filesystem blocksize (on 32 bit platforms).
				328	* __block_prepare_write does this in an [unsigned] long...
				329	* page->index << (PAGE_CACHE_SHIFT - bbits)
				330	* So, for page sized blocks (4K on 32 bit platforms),
				331	* this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
				332	* (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
				333	* but for smaller blocksizes it is less (bbits = log2 bsize).
				334	* Note1: get_block_t takes a long (implicit cast from above)
				335	* Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
				336	* can optionally convert the [unsigned] long from above into
				337	* an [unsigned] long long.
				338	*/
				339
				340	#if BITS_PER_LONG == 32
				341	# if defined(CONFIG_LBD)
				342	BUG_ON(sizeof(sector_t) != 8);
				343	pagefactor = PAGE_CACHE_SIZE;
				344	bitshift = BITS_PER_LONG;
				345	# else
				346	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
				347	# endif
				348	#endif
				349
				350	return (((unsigned long long)pagefactor) << bitshift) - 1;
				351	}
				352
				353	static int ocfs2_remount(struct super_block sb, int flags, char *data)
				354	{
				355	int incompat_features;
				356	int ret = 0;
				357	unsigned long parsed_options;
				358	struct ocfs2_super *osb = OCFS2_SB(sb);
				359
				360	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
				361	ret = -EINVAL;
				362	goto out;
				363	}
				364
				365	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
				366	(parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
				367	ret = -EINVAL;
				368	mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
				369	goto out;
				370	}
				371
				372	if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
				373	(parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
				374	ret = -EINVAL;
				375	mlog(ML_ERROR, "Cannot change data mode on remount\n");
				376	goto out;
				377	}
				378
				379	/* We're going to/from readonly mode. */
				380	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
				381	/* Lock here so the check of HARD_RO and the potential
				382	* setting of SOFT_RO is atomic. */
				383	spin_lock(&osb->osb_lock);
				384	if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
				385	mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
				386	ret = -EROFS;
				387	goto unlock_osb;
				388	}
				389
				390	if (*flags & MS_RDONLY) {
				391	mlog(0, "Going to ro mode.\n");
				392	sb->s_flags \|= MS_RDONLY;
				393	osb->osb_flags \|= OCFS2_OSB_SOFT_RO;
				394	} else {
				395	mlog(0, "Making ro filesystem writeable.\n");
				396
				397	if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
				398	mlog(ML_ERROR, "Cannot remount RDWR "
				399	"filesystem due to previous errors.\n");
				400	ret = -EROFS;
				401	goto unlock_osb;
				402	}
				403	incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
				404	if (incompat_features) {
				405	mlog(ML_ERROR, "Cannot remount RDWR because "
				406	"of unsupported optional features "
				407	"(%x).\n", incompat_features);
				408	ret = -EINVAL;
				409	goto unlock_osb;
				410	}
				411	sb->s_flags &= ~MS_RDONLY;
				412	osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
				413	}
				414	unlock_osb:
				415	spin_unlock(&osb->osb_lock);
				416	}
				417
				418	if (!ret) {
				419	if (!ocfs2_is_hard_readonly(osb))
				420	ocfs2_set_journal_params(osb);
				421
				422	/* Only save off the new mount options in case of a successful
				423	* remount. */
				424	osb->s_mount_opt = parsed_options;
				425	}
				426	out:
				427	return ret;
				428	}
				429
				430	static int ocfs2_sb_probe(struct super_block *sb,
				431	struct buffer_head **bh,
				432	int *sector_size)
				433	{
				434	int status = 0, tmpstat;
				435	struct ocfs1_vol_disk_hdr *hdr;
				436	struct ocfs2_dinode *di;
				437	int blksize;
				438
				439	*bh = NULL;
				440
				441	/* may be > 512 */
				442	*sector_size = bdev_hardsect_size(sb->s_bdev);
				443	if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
				444	mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
				445	*sector_size, OCFS2_MAX_BLOCKSIZE);
				446	status = -EINVAL;
				447	goto bail;
				448	}
				449
				450	/* Can this really happen? */
				451	if (*sector_size < OCFS2_MIN_BLOCKSIZE)
				452	*sector_size = OCFS2_MIN_BLOCKSIZE;
				453
				454	/* check block zero for old format */
				455	status = ocfs2_get_sector(sb, bh, 0, *sector_size);
				456	if (status < 0) {
				457	mlog_errno(status);
				458	goto bail;
				459	}
				460	hdr = (struct ocfs1_vol_disk_hdr ) (bh)->b_data;
				461	if (hdr->major_version == OCFS1_MAJOR_VERSION) {
				462	mlog(ML_ERROR, "incompatible version: %u.%u\n",
				463	hdr->major_version, hdr->minor_version);
				464	status = -EINVAL;
				465	}
				466	if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
				467	strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
				468	mlog(ML_ERROR, "incompatible volume signature: %8s\n",
				469	hdr->signature);
				470	status = -EINVAL;
				471	}
				472	brelse(*bh);
				473	*bh = NULL;
				474	if (status < 0) {
				475	mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
				476	"upgraded before mounting with ocfs v2\n");
				477	goto bail;
				478	}
				479
				480	/*
				481	* Now check at magic offset for 512, 1024, 2048, 4096
				482	* blocksizes. 4096 is the maximum blocksize because it is
				483	* the minimum clustersize.
				484	*/
				485	status = -EINVAL;
				486	for (blksize = *sector_size;
				487	blksize <= OCFS2_MAX_BLOCKSIZE;
				488	blksize <<= 1) {
				489	tmpstat = ocfs2_get_sector(sb, bh,
				490	OCFS2_SUPER_BLOCK_BLKNO,
				491	blksize);
				492	if (tmpstat < 0) {
				493	status = tmpstat;
				494	mlog_errno(status);
				495	goto bail;
				496	}
				497	di = (struct ocfs2_dinode ) (bh)->b_data;
				498	status = ocfs2_verify_volume(di, *bh, blksize);
				499	if (status >= 0)
				500	goto bail;
				501	brelse(*bh);
				502	*bh = NULL;
				503	if (status != -EAGAIN)
				504	break;
				505	}
				506
				507	bail:
				508	return status;
				509	}
				510
				511	static int ocfs2_fill_super(struct super_block sb, void data, int silent)
				512	{
				513	struct dentry *root;
				514	int status, sector_size;
				515	unsigned long parsed_opt;
				516	struct inode *inode = NULL;
				517	struct ocfs2_super *osb = NULL;
				518	struct buffer_head *bh = NULL;
				519
				520	mlog_entry("%p, %p, %i", sb, data, silent);
				521
				522	/* for now we only have one cluster/node, make sure we see it
				523	* in the heartbeat universe */
				524	if (!o2hb_check_local_node_heartbeating()) {
				525	status = -EINVAL;
				526	goto read_super_error;
				527	}
				528
				529	/* probe for superblock */
				530	status = ocfs2_sb_probe(sb, &bh, &sector_size);
				531	if (status < 0) {
				532	mlog(ML_ERROR, "superblock probe failed!\n");
				533	goto read_super_error;
				534	}
				535
				536	status = ocfs2_initialize_super(sb, bh, sector_size);
				537	osb = OCFS2_SB(sb);
				538	if (status < 0) {
				539	mlog_errno(status);
				540	goto read_super_error;
				541	}
				542	brelse(bh);
				543	bh = NULL;
				544
				545	if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
				546	status = -EINVAL;
				547	goto read_super_error;
				548	}
				549	osb->s_mount_opt = parsed_opt;
				550
				551	sb->s_magic = OCFS2_SUPER_MAGIC;
				552
				553	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
				554	* heartbeat=none */
				555	if (bdev_read_only(sb->s_bdev)) {
				556	if (!(sb->s_flags & MS_RDONLY)) {
				557	status = -EACCES;
				558	mlog(ML_ERROR, "Readonly device detected but readonly "
				559	"mount was not specified.\n");
				560	goto read_super_error;
				561	}
				562
				563	/* You should not be able to start a local heartbeat
				564	* on a readonly device. */
				565	if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
				566	status = -EROFS;
				567	mlog(ML_ERROR, "Local heartbeat specified on readonly "
				568	"device.\n");
				569	goto read_super_error;
				570	}
				571
				572	status = ocfs2_check_journals_nolocks(osb);
				573	if (status < 0) {
				574	if (status == -EROFS)
				575	mlog(ML_ERROR, "Recovery required on readonly "
				576	"file system, but write access is "
				577	"unavailable.\n");
				578	else
				579	mlog_errno(status);
				580	goto read_super_error;
				581	}
				582
				583	ocfs2_set_ro_flag(osb, 1);
				584
				585	printk(KERN_NOTICE "Readonly device detected. No cluster "
				586	"services will be utilized for this mount. Recovery "
				587	"will be skipped.\n");
				588	}
				589
				590	if (!ocfs2_is_hard_readonly(osb)) {
				591	/* If this isn't a hard readonly mount, then we need
				592	* to make sure that heartbeat is in a valid state,
				593	* and that we mark ourselves soft readonly is -oro
				594	* was specified. */
				595	if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
				596	mlog(ML_ERROR, "No heartbeat for device (%s)\n",
				597	sb->s_id);
				598	status = -EINVAL;
				599	goto read_super_error;
				600	}
				601
				602	if (sb->s_flags & MS_RDONLY)
				603	ocfs2_set_ro_flag(osb, 0);
				604	}
				605
				606	osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
				607	ocfs2_debugfs_root);
				608	if (!osb->osb_debug_root) {
				609	status = -EINVAL;
				610	mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
				611	goto read_super_error;
				612	}
				613
				614	status = ocfs2_mount_volume(sb);
				615	if (osb->root_inode)
				616	inode = igrab(osb->root_inode);
				617
				618	if (status < 0)
				619	goto read_super_error;
				620
				621	if (!inode) {
				622	status = -EIO;
				623	mlog_errno(status);
				624	goto read_super_error;
				625	}
				626
				627	root = d_alloc_root(inode);
				628	if (!root) {
				629	status = -ENOMEM;
				630	mlog_errno(status);
				631	goto read_super_error;
				632	}
				633
				634	sb->s_root = root;
				635
				636	ocfs2_complete_mount_recovery(osb);
				637
Sunil Mushran	781ee3e	2006-04-27 16:41:31 -0700	[diff] [blame]	638	printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) "
				639	"with %s data mode.\n",
				640	osb->dev_str, osb->node_num, osb->slot_num,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	641	osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
				642	"ordered");
				643
				644	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
				645	wake_up(&osb->osb_mount_event);
				646
				647	mlog_exit(status);
				648	return status;
				649
				650	read_super_error:
				651	if (bh != NULL)
				652	brelse(bh);
				653
				654	if (inode)
				655	iput(inode);
				656
				657	if (osb) {
				658	atomic_set(&osb->vol_state, VOLUME_DISABLED);
				659	wake_up(&osb->osb_mount_event);
				660	ocfs2_dismount_volume(sb, 1);
				661	}
				662
				663	mlog_exit(status);
				664	return status;
				665	}
				666
David Howells	454e239	2006-06-23 02:02:57 -0700	[diff] [blame]	667	static int ocfs2_get_sb(struct file_system_type *fs_type,
				668	int flags,
				669	const char *dev_name,
				670	void *data,
				671	struct vfsmount *mnt)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	672	{
David Howells	454e239	2006-06-23 02:02:57 -0700	[diff] [blame]	673	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
				674	mnt);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	675	}
				676
				677	static struct file_system_type ocfs2_fs_type = {
				678	.owner = THIS_MODULE,
				679	.name = "ocfs2",
				680	.get_sb = ocfs2_get_sb, /* is this called when we mount
				681	* the fs? */
				682	.kill_sb = kill_block_super, /* set to the generic one
				683	* right now, but do we
				684	* need to change that? */
				685	.fs_flags = FS_REQUIRES_DEV,
				686	.next = NULL
				687	};
				688
				689	static int ocfs2_parse_options(struct super_block *sb,
				690	char *options,
				691	unsigned long *mount_opt,
				692	int is_remount)
				693	{
				694	int status;
				695	char *p;
				696
				697	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
				698	options ? options : "(none)");
				699
				700	*mount_opt = 0;
				701
				702	if (!options) {
				703	status = 1;
				704	goto bail;
				705	}
				706
				707	while ((p = strsep(&options, ",")) != NULL) {
				708	int token, option;
				709	substring_t args[MAX_OPT_ARGS];
				710
				711	if (!*p)
				712	continue;
				713
				714	token = match_token(p, tokens, args);
				715	switch (token) {
				716	case Opt_hb_local:
				717	*mount_opt \|= OCFS2_MOUNT_HB_LOCAL;
				718	break;
				719	case Opt_hb_none:
				720	*mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
				721	break;
				722	case Opt_barrier:
				723	if (match_int(&args[0], &option)) {
				724	status = 0;
				725	goto bail;
				726	}
				727	if (option)
				728	*mount_opt \|= OCFS2_MOUNT_BARRIER;
				729	else
				730	*mount_opt &= ~OCFS2_MOUNT_BARRIER;
				731	break;
				732	case Opt_intr:
				733	*mount_opt &= ~OCFS2_MOUNT_NOINTR;
				734	break;
				735	case Opt_nointr:
				736	*mount_opt \|= OCFS2_MOUNT_NOINTR;
				737	break;
				738	case Opt_err_panic:
				739	*mount_opt \|= OCFS2_MOUNT_ERRORS_PANIC;
				740	break;
				741	case Opt_err_ro:
				742	*mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
				743	break;
				744	case Opt_data_ordered:
				745	*mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
				746	break;
				747	case Opt_data_writeback:
				748	*mount_opt \|= OCFS2_MOUNT_DATA_WRITEBACK;
				749	break;
				750	default:
				751	mlog(ML_ERROR,
				752	"Unrecognized mount option \"%s\" "
				753	"or missing value\n", p);
				754	status = 0;
				755	goto bail;
				756	}
				757	}
				758
				759	status = 1;
				760
				761	bail:
				762	mlog_exit(status);
				763	return status;
				764	}
				765
				766	static int __init ocfs2_init(void)
				767	{
				768	int status;
				769
				770	mlog_entry_void();
				771
				772	ocfs2_print_version();
				773
				774	if (init_ocfs2_extent_maps())
				775	return -ENOMEM;
				776
				777	status = init_ocfs2_uptodate_cache();
				778	if (status < 0) {
				779	mlog_errno(status);
				780	goto leave;
				781	}
				782
				783	status = ocfs2_initialize_mem_caches();
				784	if (status < 0) {
				785	mlog_errno(status);
				786	goto leave;
				787	}
				788
				789	ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
				790	if (!ocfs2_wq) {
				791	status = -ENOMEM;
				792	goto leave;
				793	}
				794
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	795	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
				796	if (!ocfs2_debugfs_root) {
				797	status = -EFAULT;
				798	mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
				799	}
				800
				801	leave:
				802	if (status < 0) {
				803	ocfs2_free_mem_caches();
				804	exit_ocfs2_uptodate_cache();
				805	exit_ocfs2_extent_maps();
				806	}
				807
				808	mlog_exit(status);
				809
				810	if (status >= 0) {
				811	return register_filesystem(&ocfs2_fs_type);
				812	} else
				813	return -1;
				814	}
				815
				816	static void __exit ocfs2_exit(void)
				817	{
				818	mlog_entry_void();
				819
				820	if (ocfs2_wq) {
				821	flush_workqueue(ocfs2_wq);
				822	destroy_workqueue(ocfs2_wq);
				823	}
				824
				825	debugfs_remove(ocfs2_debugfs_root);
				826
				827	ocfs2_free_mem_caches();
				828
				829	unregister_filesystem(&ocfs2_fs_type);
				830
				831	exit_ocfs2_extent_maps();
				832
				833	exit_ocfs2_uptodate_cache();
				834
				835	mlog_exit_void();
				836	}
				837
				838	static void ocfs2_put_super(struct super_block *sb)
				839	{
				840	mlog_entry("(0x%p)\n", sb);
				841
				842	ocfs2_sync_blockdev(sb);
				843	ocfs2_dismount_volume(sb, 0);
				844
				845	mlog_exit_void();
				846	}
				847
David Howells	726c334	2006-06-23 02:02:58 -0700	[diff] [blame]	848	static int ocfs2_statfs(struct dentry dentry, struct kstatfs buf)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	849	{
				850	struct ocfs2_super *osb;
				851	u32 numbits, freebits;
				852	int status;
				853	struct ocfs2_dinode *bm_lock;
				854	struct buffer_head *bh = NULL;
				855	struct inode *inode = NULL;
				856
David Howells	726c334	2006-06-23 02:02:58 -0700	[diff] [blame]	857	mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	858
David Howells	726c334	2006-06-23 02:02:58 -0700	[diff] [blame]	859	osb = OCFS2_SB(dentry->d_sb);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	860
				861	inode = ocfs2_get_system_file_inode(osb,
				862	GLOBAL_BITMAP_SYSTEM_INODE,
				863	OCFS2_INVALID_SLOT);
				864	if (!inode) {
				865	mlog(ML_ERROR, "failed to get bitmap inode\n");
				866	status = -EIO;
				867	goto bail;
				868	}
				869
				870	status = ocfs2_meta_lock(inode, NULL, &bh, 0);
				871	if (status < 0) {
				872	mlog_errno(status);
				873	goto bail;
				874	}
				875
				876	bm_lock = (struct ocfs2_dinode *) bh->b_data;
				877
				878	numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
				879	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
				880
				881	buf->f_type = OCFS2_SUPER_MAGIC;
David Howells	726c334	2006-06-23 02:02:58 -0700	[diff] [blame]	882	buf->f_bsize = dentry->d_sb->s_blocksize;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	883	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
				884	buf->f_blocks = ((sector_t) numbits) *
				885	(osb->s_clustersize >> osb->sb->s_blocksize_bits);
				886	buf->f_bfree = ((sector_t) freebits) *
				887	(osb->s_clustersize >> osb->sb->s_blocksize_bits);
				888	buf->f_bavail = buf->f_bfree;
				889	buf->f_files = numbits;
				890	buf->f_ffree = freebits;
				891
				892	brelse(bh);
				893
				894	ocfs2_meta_unlock(inode, 0);
				895	status = 0;
				896	bail:
				897	if (inode)
				898	iput(inode);
				899
				900	mlog_exit(status);
				901
				902	return status;
				903	}
				904
				905	static void ocfs2_inode_init_once(void *data,
				906	kmem_cache_t *cachep,
				907	unsigned long flags)
				908	{
				909	struct ocfs2_inode_info *oi = data;
				910
				911	if ((flags & (SLAB_CTOR_VERIFY\|SLAB_CTOR_CONSTRUCTOR)) ==
				912	SLAB_CTOR_CONSTRUCTOR) {
				913	oi->ip_flags = 0;
				914	oi->ip_open_count = 0;
				915	spin_lock_init(&oi->ip_lock);
				916	ocfs2_extent_map_init(&oi->vfs_inode);
				917	INIT_LIST_HEAD(&oi->ip_handle_list);
				918	INIT_LIST_HEAD(&oi->ip_io_markers);
				919	oi->ip_handle = NULL;
				920	oi->ip_created_trans = 0;
				921	oi->ip_last_trans = 0;
				922	oi->ip_dir_start_lookup = 0;
				923
				924	init_rwsem(&oi->ip_alloc_sem);
Mark Fasheh	251b6ec	2006-01-10 15:41:43 -0800	[diff] [blame]	925	mutex_init(&oi->ip_io_mutex);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	926
				927	oi->ip_blkno = 0ULL;
				928	oi->ip_clusters = 0;
				929
				930	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
				931	ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
				932	ocfs2_lock_res_init_once(&oi->ip_data_lockres);
				933
				934	ocfs2_metadata_cache_init(&oi->vfs_inode);
				935
				936	inode_init_once(&oi->vfs_inode);
				937	}
				938	}
				939
				940	static int ocfs2_initialize_mem_caches(void)
				941	{
				942	ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
Paul Jackson	fffb60f	2006-03-24 03:16:06 -0800	[diff] [blame]	943	sizeof(struct ocfs2_inode_info),
				944	0,
				945	(SLAB_HWCACHE_ALIGN\|SLAB_RECLAIM_ACCOUNT\|
				946	SLAB_MEM_SPREAD),
				947	ocfs2_inode_init_once, NULL);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	948	if (!ocfs2_inode_cachep)
				949	return -ENOMEM;
				950
				951	ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
				952	sizeof(struct ocfs2_journal_lock),
				953	0,
Christoph Lameter	ac2b898	2006-03-22 00:08:15 -0800	[diff] [blame]	954	SLAB_HWCACHE_ALIGN,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	955	NULL, NULL);
				956	if (!ocfs2_lock_cache)
				957	return -ENOMEM;
				958
				959	return 0;
				960	}
				961
				962	static void ocfs2_free_mem_caches(void)
				963	{
				964	if (ocfs2_inode_cachep)
				965	kmem_cache_destroy(ocfs2_inode_cachep);
				966	if (ocfs2_lock_cache)
				967	kmem_cache_destroy(ocfs2_lock_cache);
				968
				969	ocfs2_inode_cachep = NULL;
				970	ocfs2_lock_cache = NULL;
				971	}
				972
				973	static int ocfs2_get_sector(struct super_block *sb,
				974	struct buffer_head **bh,
				975	int block,
				976	int sect_size)
				977	{
				978	if (!sb_set_blocksize(sb, sect_size)) {
				979	mlog(ML_ERROR, "unable to set blocksize\n");
				980	return -EIO;
				981	}
				982
				983	*bh = sb_getblk(sb, block);
				984	if (!*bh) {
				985	mlog_errno(-EIO);
				986	return -EIO;
				987	}
				988	lock_buffer(*bh);
				989	if (!buffer_dirty(*bh))
				990	clear_buffer_uptodate(*bh);
				991	unlock_buffer(*bh);
				992	ll_rw_block(READ, 1, bh);
				993	wait_on_buffer(*bh);
				994	return 0;
				995	}
				996
				997	/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
				998	static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
				999	{
				1000	int status;
				1001
				1002	/* XXX hold a ref on the node while mounte? easy enough, if
				1003	* desirable. */
				1004	osb->node_num = o2nm_this_node();
				1005	if (osb->node_num == O2NM_MAX_NODES) {
				1006	mlog(ML_ERROR, "could not find this host's node number\n");
				1007	status = -ENOENT;
				1008	goto bail;
				1009	}
				1010
Sunil Mushran	781ee3e	2006-04-27 16:41:31 -0700	[diff] [blame]	1011	mlog(0, "I am node %d\n", osb->node_num);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1012
				1013	status = 0;
				1014	bail:
				1015	return status;
				1016	}
				1017
				1018	static int ocfs2_mount_volume(struct super_block *sb)
				1019	{
				1020	int status = 0;
				1021	int unlock_super = 0;
				1022	struct ocfs2_super *osb = OCFS2_SB(sb);
				1023
				1024	mlog_entry_void();
				1025
				1026	if (ocfs2_is_hard_readonly(osb))
				1027	goto leave;
				1028
				1029	status = ocfs2_fill_local_node_info(osb);
				1030	if (status < 0) {
				1031	mlog_errno(status);
				1032	goto leave;
				1033	}
				1034
				1035	status = ocfs2_register_hb_callbacks(osb);
				1036	if (status < 0) {
				1037	mlog_errno(status);
				1038	goto leave;
				1039	}
				1040
				1041	status = ocfs2_dlm_init(osb);
				1042	if (status < 0) {
				1043	mlog_errno(status);
				1044	goto leave;
				1045	}
				1046
				1047	/* requires vote_thread to be running. */
				1048	status = ocfs2_register_net_handlers(osb);
				1049	if (status < 0) {
				1050	mlog_errno(status);
				1051	goto leave;
				1052	}
				1053
				1054	status = ocfs2_super_lock(osb, 1);
				1055	if (status < 0) {
				1056	mlog_errno(status);
				1057	goto leave;
				1058	}
				1059	unlock_super = 1;
				1060
				1061	/* This will load up the node map and add ourselves to it. */
				1062	status = ocfs2_find_slot(osb);
				1063	if (status < 0) {
				1064	mlog_errno(status);
				1065	goto leave;
				1066	}
				1067
				1068	ocfs2_populate_mounted_map(osb);
				1069
				1070	/* load all node-local system inodes */
				1071	status = ocfs2_init_local_system_inodes(osb);
				1072	if (status < 0) {
				1073	mlog_errno(status);
				1074	goto leave;
				1075	}
				1076
				1077	status = ocfs2_check_volume(osb);
				1078	if (status < 0) {
				1079	mlog_errno(status);
				1080	goto leave;
				1081	}
				1082
				1083	status = ocfs2_truncate_log_init(osb);
				1084	if (status < 0) {
				1085	mlog_errno(status);
				1086	goto leave;
				1087	}
				1088
				1089	/* This should be sent after we recovered our journal as it
				1090	* will cause other nodes to unmark us as needing
				1091	* recovery. However, we need to send it before dropping the
				1092	* super block lock as otherwise their recovery threads might
				1093	* try to clean us up while we're live! */
				1094	status = ocfs2_request_mount_vote(osb);
				1095	if (status < 0)
				1096	mlog_errno(status);
				1097
				1098	leave:
				1099	if (unlock_super)
				1100	ocfs2_super_unlock(osb, 1);
				1101
				1102	mlog_exit(status);
				1103	return status;
				1104	}
				1105
				1106	/* we can't grab the goofy sem lock from inside wait_event, so we use
				1107	* memory barriers to make sure that we'll see the null task before
				1108	* being woken up */
				1109	static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
				1110	{
				1111	mb();
				1112	return osb->recovery_thread_task != NULL;
				1113	}
				1114
				1115	static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
				1116	{
				1117	int tmp;
				1118	struct ocfs2_super *osb = NULL;
				1119
				1120	mlog_entry("(0x%p)\n", sb);
				1121
				1122	BUG_ON(!sb);
				1123	osb = OCFS2_SB(sb);
				1124	BUG_ON(!osb);
				1125
				1126	ocfs2_shutdown_local_alloc(osb);
				1127
				1128	ocfs2_truncate_log_shutdown(osb);
				1129
				1130	/* disable any new recovery threads and wait for any currently
				1131	* running ones to exit. Do this before setting the vol_state. */
Arjan van de Ven	c74ec2f	2006-01-13 21:54:23 -0800	[diff] [blame]	1132	mutex_lock(&osb->recovery_lock);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1133	osb->disable_recovery = 1;
Arjan van de Ven	c74ec2f	2006-01-13 21:54:23 -0800	[diff] [blame]	1134	mutex_unlock(&osb->recovery_lock);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1135	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
				1136
				1137	/* At this point, we know that no more recovery threads can be
				1138	* launched, so wait for any recovery completion work to
				1139	* complete. */
				1140	flush_workqueue(ocfs2_wq);
				1141
				1142	ocfs2_journal_shutdown(osb);
				1143
				1144	ocfs2_sync_blockdev(sb);
				1145
				1146	/* No dlm means we've failed during mount, so skip all the
				1147	* steps which depended on that to complete. */
				1148	if (osb->dlm) {
				1149	tmp = ocfs2_super_lock(osb, 1);
				1150	if (tmp < 0) {
				1151	mlog_errno(tmp);
				1152	return;
				1153	}
				1154
				1155	tmp = ocfs2_request_umount_vote(osb);
				1156	if (tmp < 0)
				1157	mlog_errno(tmp);
				1158
				1159	if (osb->slot_num != OCFS2_INVALID_SLOT)
				1160	ocfs2_put_slot(osb);
				1161
				1162	ocfs2_super_unlock(osb, 1);
				1163	}
				1164
				1165	ocfs2_release_system_inodes(osb);
				1166
				1167	if (osb->dlm) {
				1168	ocfs2_unregister_net_handlers(osb);
				1169
				1170	ocfs2_dlm_shutdown(osb);
				1171	}
				1172
				1173	ocfs2_clear_hb_callbacks(osb);
				1174
				1175	debugfs_remove(osb->osb_debug_root);
				1176
				1177	if (!mnt_err)
				1178	ocfs2_stop_heartbeat(osb);
				1179
				1180	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
				1181
Sunil Mushran	781ee3e	2006-04-27 16:41:31 -0700	[diff] [blame]	1182	printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n",
				1183	osb->dev_str, osb->node_num);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1184
				1185	ocfs2_delete_osb(osb);
				1186	kfree(osb);
				1187	sb->s_dev = 0;
				1188	sb->s_fs_info = NULL;
				1189	}
				1190
				1191	static int ocfs2_setup_osb_uuid(struct ocfs2_super osb, const unsigned char uuid,
				1192	unsigned uuid_bytes)
				1193	{
				1194	int i, ret;
				1195	char *ptr;
				1196
				1197	BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
				1198
				1199	osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
				1200	if (osb->uuid_str == NULL)
				1201	return -ENOMEM;
				1202
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1203	for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
				1204	/* print with null */
				1205	ret = snprintf(ptr, 3, "%02X", uuid[i]);
				1206	if (ret != 2) /* drop super cleans up */
				1207	return -EINVAL;
				1208	/* then only advance past the last char */
				1209	ptr += 2;
				1210	}
				1211
				1212	return 0;
				1213	}
				1214
				1215	static int ocfs2_initialize_super(struct super_block *sb,
				1216	struct buffer_head *bh,
				1217	int sector_size)
				1218	{
				1219	int status = 0;
				1220	int i;
				1221	struct ocfs2_dinode *di = NULL;
				1222	struct inode *inode = NULL;
				1223	struct buffer_head *bitmap_bh = NULL;
				1224	struct ocfs2_journal *journal;
				1225	__le32 uuid_net_key;
				1226	struct ocfs2_super *osb;
				1227
				1228	mlog_entry_void();
				1229
				1230	osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
				1231	if (!osb) {
				1232	status = -ENOMEM;
				1233	mlog_errno(status);
				1234	goto bail;
				1235	}
				1236
				1237	sb->s_fs_info = osb;
				1238	sb->s_op = &ocfs2_sops;
				1239	sb->s_export_op = &ocfs2_export_ops;
				1240	sb->s_flags \|= MS_NOATIME;
				1241	/* this is needed to support O_LARGEFILE */
				1242	sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
				1243
				1244	osb->sb = sb;
				1245	/* Save off for ocfs2_rw_direct */
				1246	osb->s_sectsize_bits = blksize_bits(sector_size);
Eric Sesterhenn / snakebyte	ebdec83	2006-01-27 10:32:52 +0100	[diff] [blame]	1247	BUG_ON(!osb->s_sectsize_bits);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1248
				1249	osb->net_response_ids = 0;
				1250	spin_lock_init(&osb->net_response_lock);
				1251	INIT_LIST_HEAD(&osb->net_response_list);
				1252
				1253	INIT_LIST_HEAD(&osb->osb_net_handlers);
				1254	init_waitqueue_head(&osb->recovery_event);
				1255	spin_lock_init(&osb->vote_task_lock);
				1256	init_waitqueue_head(&osb->vote_event);
				1257	osb->vote_work_sequence = 0;
				1258	osb->vote_wake_sequence = 0;
				1259	INIT_LIST_HEAD(&osb->blocked_lock_list);
				1260	osb->blocked_lock_count = 0;
				1261	INIT_LIST_HEAD(&osb->vote_list);
				1262	spin_lock_init(&osb->osb_lock);
				1263
				1264	atomic_set(&osb->alloc_stats.moves, 0);
				1265	atomic_set(&osb->alloc_stats.local_data, 0);
				1266	atomic_set(&osb->alloc_stats.bitmap_data, 0);
				1267	atomic_set(&osb->alloc_stats.bg_allocs, 0);
				1268	atomic_set(&osb->alloc_stats.bg_extends, 0);
				1269
				1270	ocfs2_init_node_maps(osb);
				1271
				1272	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
				1273	MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
				1274
Arjan van de Ven	c74ec2f	2006-01-13 21:54:23 -0800	[diff] [blame]	1275	mutex_init(&osb->recovery_lock);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1276
				1277	osb->disable_recovery = 0;
				1278	osb->recovery_thread_task = NULL;
				1279
				1280	init_waitqueue_head(&osb->checkpoint_event);
				1281	atomic_set(&osb->needs_checkpoint, 0);
				1282
				1283	osb->node_num = O2NM_INVALID_NODE_NUM;
				1284	osb->slot_num = OCFS2_INVALID_SLOT;
				1285
				1286	osb->local_alloc_state = OCFS2_LA_UNUSED;
				1287	osb->local_alloc_bh = NULL;
				1288
				1289	ocfs2_setup_hb_callbacks(osb);
				1290
				1291	init_waitqueue_head(&osb->osb_mount_event);
				1292
				1293	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
				1294	if (!osb->vol_label) {
				1295	mlog(ML_ERROR, "unable to alloc vol label\n");
				1296	status = -ENOMEM;
				1297	goto bail;
				1298	}
				1299
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1300	di = (struct ocfs2_dinode *)bh->b_data;
				1301
				1302	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
				1303	if (osb->max_slots > OCFS2_MAX_SLOTS \|\| osb->max_slots == 0) {
				1304	mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
				1305	osb->max_slots);
				1306	status = -EINVAL;
				1307	goto bail;
				1308	}
Sunil Mushran	781ee3e	2006-04-27 16:41:31 -0700	[diff] [blame]	1309	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1310
Mark Fasheh	b4df6ed	2006-02-22 17:35:08 -0800	[diff] [blame]	1311	init_waitqueue_head(&osb->osb_wipe_event);
				1312	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
				1313	sizeof(*osb->osb_orphan_wipes),
				1314	GFP_KERNEL);
				1315	if (!osb->osb_orphan_wipes) {
				1316	status = -ENOMEM;
				1317	mlog_errno(status);
				1318	goto bail;
				1319	}
				1320
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1321	osb->s_feature_compat =
				1322	le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
				1323	osb->s_feature_ro_compat =
				1324	le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
				1325	osb->s_feature_incompat =
				1326	le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
				1327
				1328	if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
				1329	mlog(ML_ERROR, "couldn't mount because of unsupported "
				1330	"optional features (%x).\n", i);
				1331	status = -EINVAL;
				1332	goto bail;
				1333	}
				1334	if (!(osb->sb->s_flags & MS_RDONLY) &&
				1335	(i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
				1336	mlog(ML_ERROR, "couldn't mount RDWR because of "
				1337	"unsupported optional features (%x).\n", i);
				1338	status = -EINVAL;
				1339	goto bail;
				1340	}
				1341
				1342	get_random_bytes(&osb->s_next_generation, sizeof(u32));
				1343
				1344	/* FIXME
				1345	* This should be done in ocfs2_journal_init(), but unknown
				1346	* ordering issues will cause the filesystem to crash.
				1347	* If anyone wants to figure out what part of the code
				1348	* refers to osb->journal before ocfs2_journal_init() is run,
				1349	* be my guest.
				1350	*/
				1351	/* initialize our journal structure */
				1352
				1353	journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
				1354	if (!journal) {
				1355	mlog(ML_ERROR, "unable to alloc journal\n");
				1356	status = -ENOMEM;
				1357	goto bail;
				1358	}
				1359	osb->journal = journal;
				1360	journal->j_osb = osb;
				1361
				1362	atomic_set(&journal->j_num_trans, 0);
				1363	init_rwsem(&journal->j_trans_barrier);
				1364	init_waitqueue_head(&journal->j_checkpointed);
				1365	spin_lock_init(&journal->j_lock);
				1366	journal->j_trans_id = (unsigned long) 1;
				1367	INIT_LIST_HEAD(&journal->j_la_cleanups);
				1368	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
				1369	journal->j_state = OCFS2_JOURNAL_FREE;
				1370
				1371	/* get some pseudo constants for clustersize bits */
				1372	osb->s_clustersize_bits =
				1373	le32_to_cpu(di->id2.i_super.s_clustersize_bits);
				1374	osb->s_clustersize = 1 << osb->s_clustersize_bits;
				1375	mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
				1376
				1377	if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE \|\|
				1378	osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
				1379	mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
				1380	osb->s_clustersize);
				1381	status = -EINVAL;
				1382	goto bail;
				1383	}
				1384
				1385	if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
				1386	> (u32)~0UL) {
				1387	mlog(ML_ERROR, "Volume might try to write to blocks beyond "
				1388	"what jbd can address in 32 bits.\n");
				1389	status = -EINVAL;
				1390	goto bail;
				1391	}
				1392
				1393	if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
				1394	sizeof(di->id2.i_super.s_uuid))) {
				1395	mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
				1396	status = -ENOMEM;
				1397	goto bail;
				1398	}
				1399
Mark Fasheh	7842704	2006-05-04 12:03:26 -0700	[diff] [blame]	1400	memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1401	osb->net_key = le32_to_cpu(uuid_net_key);
				1402
				1403	strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
				1404	osb->vol_label[63] = '\0';
				1405	osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
				1406	osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
				1407	osb->first_cluster_group_blkno =
				1408	le64_to_cpu(di->id2.i_super.s_first_cluster_group);
				1409	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
				1410	mlog(0, "vol_label: %s\n", osb->vol_label);
				1411	mlog(0, "uuid: %s\n", osb->uuid_str);
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	1412	mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
				1413	(unsigned long long)osb->root_blkno,
				1414	(unsigned long long)osb->system_dir_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1415
				1416	osb->osb_dlm_debug = ocfs2_new_dlm_debug();
				1417	if (!osb->osb_dlm_debug) {
				1418	status = -ENOMEM;
				1419	mlog_errno(status);
				1420	goto bail;
				1421	}
				1422
				1423	atomic_set(&osb->vol_state, VOLUME_INIT);
				1424
				1425	/* load root, system_dir, and all global system inodes */
				1426	status = ocfs2_init_global_system_inodes(osb);
				1427	if (status < 0) {
				1428	mlog_errno(status);
				1429	goto bail;
				1430	}
				1431
				1432	/*
				1433	* global bitmap
				1434	*/
				1435	inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
				1436	OCFS2_INVALID_SLOT);
				1437	if (!inode) {
				1438	status = -EINVAL;
				1439	mlog_errno(status);
				1440	goto bail;
				1441	}
				1442
				1443	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
				1444
Mark Fasheh	101ebf2	2006-05-02 17:54:45 -0700	[diff] [blame]	1445	/* We don't have a cluster lock on the bitmap here because
				1446	* we're only interested in static information and the extra
				1447	* complexity at mount time isn't worht it. Don't pass the
				1448	* inode in to the read function though as we don't want it to
				1449	* be put in the cache. */
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1450	status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
Mark Fasheh	101ebf2	2006-05-02 17:54:45 -0700	[diff] [blame]	1451	NULL);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1452	iput(inode);
				1453	if (status < 0) {
				1454	mlog_errno(status);
				1455	goto bail;
				1456	}
				1457
				1458	di = (struct ocfs2_dinode *) bitmap_bh->b_data;
				1459	osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1460	brelse(bitmap_bh);
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	1461	mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
				1462	(unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1463
				1464	status = ocfs2_init_slot_info(osb);
				1465	if (status < 0) {
				1466	mlog_errno(status);
				1467	goto bail;
				1468	}
				1469
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1470	bail:
				1471	mlog_exit(status);
				1472	return status;
				1473	}
				1474
				1475	/*
				1476	* will return: -EAGAIN if it is ok to keep searching for superblocks
				1477	* -EINVAL if there is a bad superblock
				1478	* 0 on success
				1479	*/
				1480	static int ocfs2_verify_volume(struct ocfs2_dinode *di,
				1481	struct buffer_head *bh,
				1482	u32 blksz)
				1483	{
				1484	int status = -EAGAIN;
				1485
				1486	mlog_entry_void();
				1487
				1488	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
				1489	strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
				1490	status = -EINVAL;
				1491	if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
				1492	mlog(ML_ERROR, "found superblock with incorrect block "
				1493	"size: found %u, should be %u\n",
				1494	1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
				1495	blksz);
				1496	} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
				1497	OCFS2_MAJOR_REV_LEVEL \|\|
				1498	le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
				1499	OCFS2_MINOR_REV_LEVEL) {
				1500	mlog(ML_ERROR, "found superblock with bad version: "
				1501	"found %u.%u, should be %u.%u\n",
				1502	le16_to_cpu(di->id2.i_super.s_major_rev_level),
				1503	le16_to_cpu(di->id2.i_super.s_minor_rev_level),
				1504	OCFS2_MAJOR_REV_LEVEL,
				1505	OCFS2_MINOR_REV_LEVEL);
				1506	} else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
				1507	mlog(ML_ERROR, "bad block number on superblock: "
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	1508	"found %llu, should be %llu\n",
				1509	(unsigned long long)di->i_blkno,
				1510	(unsigned long long)bh->b_blocknr);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1511	} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 \|\|
				1512	le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
				1513	mlog(ML_ERROR, "bad cluster size found: %u\n",
				1514	1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
				1515	} else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
				1516	mlog(ML_ERROR, "bad root_blkno: 0\n");
				1517	} else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
				1518	mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
				1519	} else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
				1520	mlog(ML_ERROR,
				1521	"Superblock slots found greater than file system "
				1522	"maximum: found %u, max %u\n",
				1523	le16_to_cpu(di->id2.i_super.s_max_slots),
				1524	OCFS2_MAX_SLOTS);
				1525	} else {
				1526	/* found it! */
				1527	status = 0;
				1528	}
				1529	}
				1530
				1531	mlog_exit(status);
				1532	return status;
				1533	}
				1534
				1535	static int ocfs2_check_volume(struct ocfs2_super *osb)
				1536	{
				1537	int status = 0;
				1538	int dirty;
				1539	struct ocfs2_dinode local_alloc = NULL; / only used if we
				1540	* recover
				1541	* ourselves. */
				1542
				1543	mlog_entry_void();
				1544
				1545	/* Init our journal object. */
				1546	status = ocfs2_journal_init(osb->journal, &dirty);
				1547	if (status < 0) {
				1548	mlog(ML_ERROR, "Could not initialize journal!\n");
				1549	goto finally;
				1550	}
				1551
				1552	/* If the journal was unmounted cleanly then we don't want to
				1553	* recover anything. Otherwise, journal_load will do that
				1554	* dirty work for us :) */
				1555	if (!dirty) {
				1556	status = ocfs2_journal_wipe(osb->journal, 0);
				1557	if (status < 0) {
				1558	mlog_errno(status);
				1559	goto finally;
				1560	}
				1561	} else {
				1562	mlog(ML_NOTICE, "File system was not unmounted cleanly, "
				1563	"recovering volume.\n");
				1564	}
				1565
				1566	/* will play back anything left in the journal. */
				1567	ocfs2_journal_load(osb->journal);
				1568
				1569	if (dirty) {
				1570	/* recover my local alloc if we didn't unmount cleanly. */
				1571	status = ocfs2_begin_local_alloc_recovery(osb,
				1572	osb->slot_num,
				1573	&local_alloc);
				1574	if (status < 0) {
				1575	mlog_errno(status);
				1576	goto finally;
				1577	}
				1578	/* we complete the recovery process after we've marked
				1579	* ourselves as mounted. */
				1580	}
				1581
				1582	mlog(0, "Journal loaded.\n");
				1583
				1584	status = ocfs2_load_local_alloc(osb);
				1585	if (status < 0) {
				1586	mlog_errno(status);
				1587	goto finally;
				1588	}
				1589
				1590	if (dirty) {
				1591	/* Recovery will be completed after we've mounted the
				1592	* rest of the volume. */
				1593	osb->dirty = 1;
				1594	osb->local_alloc_copy = local_alloc;
				1595	local_alloc = NULL;
				1596	}
				1597
				1598	/* go through each journal, trylock it and if you get the
				1599	* lock, and it's marked as dirty, set the bit in the recover
				1600	* map and launch a recovery thread for it. */
				1601	status = ocfs2_mark_dead_nodes(osb);
				1602	if (status < 0)
				1603	mlog_errno(status);
				1604
				1605	finally:
				1606	if (local_alloc)
				1607	kfree(local_alloc);
				1608
				1609	mlog_exit(status);
				1610	return status;
				1611	}
				1612
				1613	/*
				1614	* The routine gets called from dismount or close whenever a dismount on
				1615	* volume is requested and the osb open count becomes 1.
				1616	* It will remove the osb from the global list and also free up all the
				1617	* initialized resources and fileobject.
				1618	*/
				1619	static void ocfs2_delete_osb(struct ocfs2_super *osb)
				1620	{
				1621	mlog_entry_void();
				1622
				1623	/* This function assumes that the caller has the main osb resource */
				1624
				1625	if (osb->slot_info)
				1626	ocfs2_free_slot_info(osb->slot_info);
				1627
Mark Fasheh	b4df6ed	2006-02-22 17:35:08 -0800	[diff] [blame]	1628	kfree(osb->osb_orphan_wipes);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1629	/* FIXME
				1630	* This belongs in journal shutdown, but because we have to
				1631	* allocate osb->journal at the start of ocfs2_initalize_osb(),
				1632	* we free it here.
				1633	*/
				1634	kfree(osb->journal);
				1635	if (osb->local_alloc_copy)
				1636	kfree(osb->local_alloc_copy);
				1637	kfree(osb->uuid_str);
				1638	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
				1639	memset(osb, 0, sizeof(struct ocfs2_super));
				1640
				1641	mlog_exit_void();
				1642	}
				1643
				1644	/* Put OCFS2 into a readonly state, or (if the user specifies it),
				1645	* panic(). We do not support continue-on-error operation. */
				1646	static void ocfs2_handle_error(struct super_block *sb)
				1647	{
				1648	struct ocfs2_super *osb = OCFS2_SB(sb);
				1649
				1650	if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
				1651	panic("OCFS2: (device %s): panic forced after error\n",
				1652	sb->s_id);
				1653
				1654	ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
				1655
				1656	if (sb->s_flags & MS_RDONLY &&
				1657	(ocfs2_is_soft_readonly(osb) \|\|
				1658	ocfs2_is_hard_readonly(osb)))
				1659	return;
				1660
				1661	printk(KERN_CRIT "File system is now read-only due to the potential "
				1662	"of on-disk corruption. Please run fsck.ocfs2 once the file "
				1663	"system is unmounted.\n");
				1664	sb->s_flags \|= MS_RDONLY;
				1665	ocfs2_set_ro_flag(osb, 0);
				1666	}
				1667
				1668	static char error_buf[1024];
				1669
				1670	void __ocfs2_error(struct super_block *sb,
				1671	const char *function,
				1672	const char *fmt, ...)
				1673	{
				1674	va_list args;
				1675
				1676	va_start(args, fmt);
				1677	vsprintf(error_buf, fmt, args);
				1678	va_end(args);
				1679
				1680	/* Not using mlog here because we want to show the actual
				1681	* function the error came from. */
				1682	printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
				1683	sb->s_id, function, error_buf);
				1684
				1685	ocfs2_handle_error(sb);
				1686	}
				1687
				1688	/* Handle critical errors. This is intentionally more drastic than
				1689	* ocfs2_handle_error, so we only use for things like journal errors,
				1690	* etc. */
				1691	void __ocfs2_abort(struct super_block* sb,
				1692	const char *function,
				1693	const char *fmt, ...)
				1694	{
				1695	va_list args;
				1696
				1697	va_start(args, fmt);
				1698	vsprintf(error_buf, fmt, args);
				1699	va_end(args);
				1700
				1701	printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
				1702	sb->s_id, function, error_buf);
				1703
				1704	/* We don't have the cluster support yet to go straight to
				1705	* hard readonly in here. Until then, we want to keep
				1706	* ocfs2_abort() so that we can at least mark critical
				1707	* errors.
				1708	*
				1709	* TODO: This should abort the journal and alert other nodes
				1710	* that our slot needs recovery. */
				1711
				1712	/* Force a panic(). This stinks, but it's better than letting
				1713	* things continue without having a proper hard readonly
				1714	* here. */
				1715	OCFS2_SB(sb)->s_mount_opt \|= OCFS2_MOUNT_ERRORS_PANIC;
				1716	ocfs2_handle_error(sb);
				1717	}
				1718
/* Register ocfs2_init() and ocfs2_exit() as the module's init and exit
 * functions. */
module_init(ocfs2_init);
module_exit(ocfs2_exit);