Blame - fs/ocfs2/super.c - kernel/msm

blob: 949b3dac30f142127cefd25226d149c4c8bf465a [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -- mode: c; c-basic-offset: 8; --
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* super.c
				5	*
				6	* load/unload driver, mount/dismount volumes
				7	*
				8	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				9	*
				10	* This program is free software; you can redistribute it and/or
				11	* modify it under the terms of the GNU General Public
				12	* License as published by the Free Software Foundation; either
				13	* version 2 of the License, or (at your option) any later version.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 021110-1307, USA.
				24	*/
				25
				26	#include <linux/module.h>
				27	#include <linux/fs.h>
				28	#include <linux/types.h>
				29	#include <linux/slab.h>
				30	#include <linux/highmem.h>
				31	#include <linux/utsname.h>
				32	#include <linux/init.h>
				33	#include <linux/random.h>
				34	#include <linux/statfs.h>
				35	#include <linux/moduleparam.h>
				36	#include <linux/blkdev.h>
				37	#include <linux/socket.h>
				38	#include <linux/inet.h>
				39	#include <linux/parser.h>
				40	#include <linux/crc32.h>
				41	#include <linux/debugfs.h>
				42
				43	#include <cluster/nodemanager.h>
				44
				45	#define MLOG_MASK_PREFIX ML_SUPER
				46	#include <cluster/masklog.h>
				47
				48	#include "ocfs2.h"
				49
				50	/* this should be the only file to include a version 1 header */
				51	#include "ocfs1_fs_compat.h"
				52
				53	#include "alloc.h"
				54	#include "dlmglue.h"
				55	#include "export.h"
				56	#include "extent_map.h"
				57	#include "heartbeat.h"
				58	#include "inode.h"
				59	#include "journal.h"
				60	#include "localalloc.h"
				61	#include "namei.h"
				62	#include "slot_map.h"
				63	#include "super.h"
				64	#include "sysfile.h"
				65	#include "uptodate.h"
				66	#include "ver.h"
				67	#include "vote.h"
				68
				69	#include "buffer_head_io.h"
				70
				71	/*
				72	* Globals
				73	*/
				74	static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
				75
				76	static u32 osb_id; /* Keeps track of next available OSB Id */
				77
				78	static kmem_cache_t *ocfs2_inode_cachep = NULL;
				79
				80	kmem_cache_t *ocfs2_lock_cache = NULL;
				81
				82	/* OCFS2 needs to schedule several differnt types of work which
				83	* require cluster locking, disk I/O, recovery waits, etc. Since these
				84	* types of work tend to be heavy we avoid using the kernel events
				85	* workqueue and schedule on our own. */
				86	struct workqueue_struct *ocfs2_wq = NULL;
				87
				88	static struct dentry *ocfs2_debugfs_root = NULL;
				89
				90	MODULE_AUTHOR("Oracle");
				91	MODULE_LICENSE("GPL");
				92
				93	static int ocfs2_parse_options(struct super_block sb, char options,
				94	unsigned long *mount_opt, int is_remount);
				95	static void ocfs2_put_super(struct super_block *sb);
				96	static int ocfs2_mount_volume(struct super_block *sb);
				97	static int ocfs2_remount(struct super_block sb, int flags, char *data);
				98	static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
				99	static int ocfs2_initialize_mem_caches(void);
				100	static void ocfs2_free_mem_caches(void);
				101	static void ocfs2_delete_osb(struct ocfs2_super *osb);
				102
				103	static int ocfs2_statfs(struct super_block sb, struct kstatfs buf);
				104
				105	static int ocfs2_sync_fs(struct super_block *sb, int wait);
				106
				107	static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
				108	static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
				109	static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
				110	static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
				111	static int ocfs2_check_volume(struct ocfs2_super *osb);
				112	static int ocfs2_verify_volume(struct ocfs2_dinode *di,
				113	struct buffer_head *bh,
				114	u32 sectsize);
				115	static int ocfs2_initialize_super(struct super_block *sb,
				116	struct buffer_head *bh,
				117	int sector_size);
				118	static int ocfs2_get_sector(struct super_block *sb,
				119	struct buffer_head **bh,
				120	int block,
				121	int sect_size);
				122	static void ocfs2_write_super(struct super_block *sb);
				123	static struct inode ocfs2_alloc_inode(struct super_block sb);
				124	static void ocfs2_destroy_inode(struct inode *inode);
				125
				126	static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
				127
				128	static struct super_operations ocfs2_sops = {
				129	.statfs = ocfs2_statfs,
				130	.alloc_inode = ocfs2_alloc_inode,
				131	.destroy_inode = ocfs2_destroy_inode,
				132	.drop_inode = ocfs2_drop_inode,
				133	.clear_inode = ocfs2_clear_inode,
				134	.delete_inode = ocfs2_delete_inode,
				135	.sync_fs = ocfs2_sync_fs,
				136	.write_super = ocfs2_write_super,
				137	.put_super = ocfs2_put_super,
				138	.remount_fs = ocfs2_remount,
				139	};
				140
				141	enum {
				142	Opt_barrier,
				143	Opt_err_panic,
				144	Opt_err_ro,
				145	Opt_intr,
				146	Opt_nointr,
				147	Opt_hb_none,
				148	Opt_hb_local,
				149	Opt_data_ordered,
				150	Opt_data_writeback,
				151	Opt_err,
				152	};
				153
				154	static match_table_t tokens = {
				155	{Opt_barrier, "barrier=%u"},
				156	{Opt_err_panic, "errors=panic"},
				157	{Opt_err_ro, "errors=remount-ro"},
				158	{Opt_intr, "intr"},
				159	{Opt_nointr, "nointr"},
				160	{Opt_hb_none, OCFS2_HB_NONE},
				161	{Opt_hb_local, OCFS2_HB_LOCAL},
				162	{Opt_data_ordered, "data=ordered"},
				163	{Opt_data_writeback, "data=writeback"},
				164	{Opt_err, NULL}
				165	};
				166
				167	/*
				168	* write_super and sync_fs ripped right out of ext3.
				169	*/
				170	static void ocfs2_write_super(struct super_block *sb)
				171	{
Ingo Molnar	7892f2f	2006-01-09 15:59:25 -0800	[diff] [blame]	172	if (mutex_trylock(&sb->s_lock) != 0)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	173	BUG();
				174	sb->s_dirt = 0;
				175	}
				176
				177	static int ocfs2_sync_fs(struct super_block *sb, int wait)
				178	{
				179	int status = 0;
				180	tid_t target;
				181	struct ocfs2_super *osb = OCFS2_SB(sb);
				182
				183	sb->s_dirt = 0;
				184
				185	if (ocfs2_is_hard_readonly(osb))
				186	return -EROFS;
				187
				188	if (wait) {
				189	status = ocfs2_flush_truncate_log(osb);
				190	if (status < 0)
				191	mlog_errno(status);
				192	} else {
				193	ocfs2_schedule_truncate_log_flush(osb, 0);
				194	}
				195
				196	if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
				197	if (wait)
				198	log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
				199	target);
				200	}
				201	return 0;
				202	}
				203
				204	static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
				205	{
				206	struct inode *new = NULL;
				207	int status = 0;
				208	int i;
				209
				210	mlog_entry_void();
				211
				212	new = ocfs2_iget(osb, osb->root_blkno);
				213	if (IS_ERR(new)) {
				214	status = PTR_ERR(new);
				215	mlog_errno(status);
				216	goto bail;
				217	}
				218	osb->root_inode = new;
				219
				220	new = ocfs2_iget(osb, osb->system_dir_blkno);
				221	if (IS_ERR(new)) {
				222	status = PTR_ERR(new);
				223	mlog_errno(status);
				224	goto bail;
				225	}
				226	osb->sys_root_inode = new;
				227
				228	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
				229	i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
				230	new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
				231	if (!new) {
				232	ocfs2_release_system_inodes(osb);
				233	status = -EINVAL;
				234	mlog_errno(status);
				235	/* FIXME: Should ERROR_RO_FS */
				236	mlog(ML_ERROR, "Unable to load system inode %d, "
				237	"possibly corrupt fs?", i);
				238	goto bail;
				239	}
				240	// the array now has one ref, so drop this one
				241	iput(new);
				242	}
				243
				244	bail:
				245	mlog_exit(status);
				246	return status;
				247	}
				248
				249	static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
				250	{
				251	struct inode *new = NULL;
				252	int status = 0;
				253	int i;
				254
				255	mlog_entry_void();
				256
				257	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
				258	i < NUM_SYSTEM_INODES;
				259	i++) {
				260	new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
				261	if (!new) {
				262	ocfs2_release_system_inodes(osb);
				263	status = -EINVAL;
				264	mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
				265	status, i, osb->slot_num);
				266	goto bail;
				267	}
				268	/* the array now has one ref, so drop this one */
				269	iput(new);
				270	}
				271
				272	bail:
				273	mlog_exit(status);
				274	return status;
				275	}
				276
				277	static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
				278	{
				279	int status = 0, i;
				280	struct inode *inode;
				281
				282	mlog_entry_void();
				283
				284	for (i = 0; i < NUM_SYSTEM_INODES; i++) {
				285	inode = osb->system_inodes[i];
				286	if (inode) {
				287	iput(inode);
				288	osb->system_inodes[i] = NULL;
				289	}
				290	}
				291
				292	inode = osb->sys_root_inode;
				293	if (inode) {
				294	iput(inode);
				295	osb->sys_root_inode = NULL;
				296	}
				297
				298	inode = osb->root_inode;
				299	if (inode) {
				300	iput(inode);
				301	osb->root_inode = NULL;
				302	}
				303
				304	mlog_exit(status);
				305	return status;
				306	}
				307
				308	/* We're allocating fs objects, use GFP_NOFS */
				309	static struct inode ocfs2_alloc_inode(struct super_block sb)
				310	{
				311	struct ocfs2_inode_info *oi;
				312
				313	oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
				314	if (!oi)
				315	return NULL;
				316
				317	return &oi->vfs_inode;
				318	}
				319
				320	static void ocfs2_destroy_inode(struct inode *inode)
				321	{
				322	kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
				323	}
				324
				325	/* From xfs_super.c:xfs_max_file_offset
				326	* Copyright (c) 2000-2004 Silicon Graphics, Inc.
				327	*/
				328	static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
				329	{
				330	unsigned int pagefactor = 1;
				331	unsigned int bitshift = BITS_PER_LONG - 1;
				332
				333	/* Figure out maximum filesize, on Linux this can depend on
				334	* the filesystem blocksize (on 32 bit platforms).
				335	* __block_prepare_write does this in an [unsigned] long...
				336	* page->index << (PAGE_CACHE_SHIFT - bbits)
				337	* So, for page sized blocks (4K on 32 bit platforms),
				338	* this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
				339	* (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
				340	* but for smaller blocksizes it is less (bbits = log2 bsize).
				341	* Note1: get_block_t takes a long (implicit cast from above)
				342	* Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
				343	* can optionally convert the [unsigned] long from above into
				344	* an [unsigned] long long.
				345	*/
				346
				347	#if BITS_PER_LONG == 32
				348	# if defined(CONFIG_LBD)
				349	BUG_ON(sizeof(sector_t) != 8);
				350	pagefactor = PAGE_CACHE_SIZE;
				351	bitshift = BITS_PER_LONG;
				352	# else
				353	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
				354	# endif
				355	#endif
				356
				357	return (((unsigned long long)pagefactor) << bitshift) - 1;
				358	}
				359
				360	static int ocfs2_remount(struct super_block sb, int flags, char *data)
				361	{
				362	int incompat_features;
				363	int ret = 0;
				364	unsigned long parsed_options;
				365	struct ocfs2_super *osb = OCFS2_SB(sb);
				366
				367	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
				368	ret = -EINVAL;
				369	goto out;
				370	}
				371
				372	if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
				373	(parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
				374	ret = -EINVAL;
				375	mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
				376	goto out;
				377	}
				378
				379	if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
				380	(parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
				381	ret = -EINVAL;
				382	mlog(ML_ERROR, "Cannot change data mode on remount\n");
				383	goto out;
				384	}
				385
				386	/* We're going to/from readonly mode. */
				387	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
				388	/* Lock here so the check of HARD_RO and the potential
				389	* setting of SOFT_RO is atomic. */
				390	spin_lock(&osb->osb_lock);
				391	if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
				392	mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
				393	ret = -EROFS;
				394	goto unlock_osb;
				395	}
				396
				397	if (*flags & MS_RDONLY) {
				398	mlog(0, "Going to ro mode.\n");
				399	sb->s_flags \|= MS_RDONLY;
				400	osb->osb_flags \|= OCFS2_OSB_SOFT_RO;
				401	} else {
				402	mlog(0, "Making ro filesystem writeable.\n");
				403
				404	if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
				405	mlog(ML_ERROR, "Cannot remount RDWR "
				406	"filesystem due to previous errors.\n");
				407	ret = -EROFS;
				408	goto unlock_osb;
				409	}
				410	incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
				411	if (incompat_features) {
				412	mlog(ML_ERROR, "Cannot remount RDWR because "
				413	"of unsupported optional features "
				414	"(%x).\n", incompat_features);
				415	ret = -EINVAL;
				416	goto unlock_osb;
				417	}
				418	sb->s_flags &= ~MS_RDONLY;
				419	osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
				420	}
				421	unlock_osb:
				422	spin_unlock(&osb->osb_lock);
				423	}
				424
				425	if (!ret) {
				426	if (!ocfs2_is_hard_readonly(osb))
				427	ocfs2_set_journal_params(osb);
				428
				429	/* Only save off the new mount options in case of a successful
				430	* remount. */
				431	osb->s_mount_opt = parsed_options;
				432	}
				433	out:
				434	return ret;
				435	}
				436
				437	static int ocfs2_sb_probe(struct super_block *sb,
				438	struct buffer_head **bh,
				439	int *sector_size)
				440	{
				441	int status = 0, tmpstat;
				442	struct ocfs1_vol_disk_hdr *hdr;
				443	struct ocfs2_dinode *di;
				444	int blksize;
				445
				446	*bh = NULL;
				447
				448	/* may be > 512 */
				449	*sector_size = bdev_hardsect_size(sb->s_bdev);
				450	if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
				451	mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
				452	*sector_size, OCFS2_MAX_BLOCKSIZE);
				453	status = -EINVAL;
				454	goto bail;
				455	}
				456
				457	/* Can this really happen? */
				458	if (*sector_size < OCFS2_MIN_BLOCKSIZE)
				459	*sector_size = OCFS2_MIN_BLOCKSIZE;
				460
				461	/* check block zero for old format */
				462	status = ocfs2_get_sector(sb, bh, 0, *sector_size);
				463	if (status < 0) {
				464	mlog_errno(status);
				465	goto bail;
				466	}
				467	hdr = (struct ocfs1_vol_disk_hdr ) (bh)->b_data;
				468	if (hdr->major_version == OCFS1_MAJOR_VERSION) {
				469	mlog(ML_ERROR, "incompatible version: %u.%u\n",
				470	hdr->major_version, hdr->minor_version);
				471	status = -EINVAL;
				472	}
				473	if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
				474	strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
				475	mlog(ML_ERROR, "incompatible volume signature: %8s\n",
				476	hdr->signature);
				477	status = -EINVAL;
				478	}
				479	brelse(*bh);
				480	*bh = NULL;
				481	if (status < 0) {
				482	mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
				483	"upgraded before mounting with ocfs v2\n");
				484	goto bail;
				485	}
				486
				487	/*
				488	* Now check at magic offset for 512, 1024, 2048, 4096
				489	* blocksizes. 4096 is the maximum blocksize because it is
				490	* the minimum clustersize.
				491	*/
				492	status = -EINVAL;
				493	for (blksize = *sector_size;
				494	blksize <= OCFS2_MAX_BLOCKSIZE;
				495	blksize <<= 1) {
				496	tmpstat = ocfs2_get_sector(sb, bh,
				497	OCFS2_SUPER_BLOCK_BLKNO,
				498	blksize);
				499	if (tmpstat < 0) {
				500	status = tmpstat;
				501	mlog_errno(status);
				502	goto bail;
				503	}
				504	di = (struct ocfs2_dinode ) (bh)->b_data;
				505	status = ocfs2_verify_volume(di, *bh, blksize);
				506	if (status >= 0)
				507	goto bail;
				508	brelse(*bh);
				509	*bh = NULL;
				510	if (status != -EAGAIN)
				511	break;
				512	}
				513
				514	bail:
				515	return status;
				516	}
				517
				518	static int ocfs2_fill_super(struct super_block sb, void data, int silent)
				519	{
				520	struct dentry *root;
				521	int status, sector_size;
				522	unsigned long parsed_opt;
				523	struct inode *inode = NULL;
				524	struct ocfs2_super *osb = NULL;
				525	struct buffer_head *bh = NULL;
				526
				527	mlog_entry("%p, %p, %i", sb, data, silent);
				528
				529	/* for now we only have one cluster/node, make sure we see it
				530	* in the heartbeat universe */
				531	if (!o2hb_check_local_node_heartbeating()) {
				532	status = -EINVAL;
				533	goto read_super_error;
				534	}
				535
				536	/* probe for superblock */
				537	status = ocfs2_sb_probe(sb, &bh, &sector_size);
				538	if (status < 0) {
				539	mlog(ML_ERROR, "superblock probe failed!\n");
				540	goto read_super_error;
				541	}
				542
				543	status = ocfs2_initialize_super(sb, bh, sector_size);
				544	osb = OCFS2_SB(sb);
				545	if (status < 0) {
				546	mlog_errno(status);
				547	goto read_super_error;
				548	}
				549	brelse(bh);
				550	bh = NULL;
				551
				552	if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
				553	status = -EINVAL;
				554	goto read_super_error;
				555	}
				556	osb->s_mount_opt = parsed_opt;
				557
				558	sb->s_magic = OCFS2_SUPER_MAGIC;
				559
				560	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
				561	* heartbeat=none */
				562	if (bdev_read_only(sb->s_bdev)) {
				563	if (!(sb->s_flags & MS_RDONLY)) {
				564	status = -EACCES;
				565	mlog(ML_ERROR, "Readonly device detected but readonly "
				566	"mount was not specified.\n");
				567	goto read_super_error;
				568	}
				569
				570	/* You should not be able to start a local heartbeat
				571	* on a readonly device. */
				572	if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
				573	status = -EROFS;
				574	mlog(ML_ERROR, "Local heartbeat specified on readonly "
				575	"device.\n");
				576	goto read_super_error;
				577	}
				578
				579	status = ocfs2_check_journals_nolocks(osb);
				580	if (status < 0) {
				581	if (status == -EROFS)
				582	mlog(ML_ERROR, "Recovery required on readonly "
				583	"file system, but write access is "
				584	"unavailable.\n");
				585	else
				586	mlog_errno(status);
				587	goto read_super_error;
				588	}
				589
				590	ocfs2_set_ro_flag(osb, 1);
				591
				592	printk(KERN_NOTICE "Readonly device detected. No cluster "
				593	"services will be utilized for this mount. Recovery "
				594	"will be skipped.\n");
				595	}
				596
				597	if (!ocfs2_is_hard_readonly(osb)) {
				598	/* If this isn't a hard readonly mount, then we need
				599	* to make sure that heartbeat is in a valid state,
				600	* and that we mark ourselves soft readonly is -oro
				601	* was specified. */
				602	if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
				603	mlog(ML_ERROR, "No heartbeat for device (%s)\n",
				604	sb->s_id);
				605	status = -EINVAL;
				606	goto read_super_error;
				607	}
				608
				609	if (sb->s_flags & MS_RDONLY)
				610	ocfs2_set_ro_flag(osb, 0);
				611	}
				612
				613	osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
				614	ocfs2_debugfs_root);
				615	if (!osb->osb_debug_root) {
				616	status = -EINVAL;
				617	mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
				618	goto read_super_error;
				619	}
				620
				621	status = ocfs2_mount_volume(sb);
				622	if (osb->root_inode)
				623	inode = igrab(osb->root_inode);
				624
				625	if (status < 0)
				626	goto read_super_error;
				627
				628	if (!inode) {
				629	status = -EIO;
				630	mlog_errno(status);
				631	goto read_super_error;
				632	}
				633
				634	root = d_alloc_root(inode);
				635	if (!root) {
				636	status = -ENOMEM;
				637	mlog_errno(status);
				638	goto read_super_error;
				639	}
				640
				641	sb->s_root = root;
				642
				643	ocfs2_complete_mount_recovery(osb);
				644
				645	printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
				646	"data mode.\n",
				647	MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
				648	osb->slot_num,
				649	osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
				650	"ordered");
				651
				652	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
				653	wake_up(&osb->osb_mount_event);
				654
				655	mlog_exit(status);
				656	return status;
				657
				658	read_super_error:
				659	if (bh != NULL)
				660	brelse(bh);
				661
				662	if (inode)
				663	iput(inode);
				664
				665	if (osb) {
				666	atomic_set(&osb->vol_state, VOLUME_DISABLED);
				667	wake_up(&osb->osb_mount_event);
				668	ocfs2_dismount_volume(sb, 1);
				669	}
				670
				671	mlog_exit(status);
				672	return status;
				673	}
				674
				675	static struct super_block ocfs2_get_sb(struct file_system_type fs_type,
				676	int flags,
				677	const char *dev_name,
				678	void *data)
				679	{
				680	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
				681	}
				682
				683	static struct file_system_type ocfs2_fs_type = {
				684	.owner = THIS_MODULE,
				685	.name = "ocfs2",
				686	.get_sb = ocfs2_get_sb, /* is this called when we mount
				687	* the fs? */
				688	.kill_sb = kill_block_super, /* set to the generic one
				689	* right now, but do we
				690	* need to change that? */
				691	.fs_flags = FS_REQUIRES_DEV,
				692	.next = NULL
				693	};
				694
				695	static int ocfs2_parse_options(struct super_block *sb,
				696	char *options,
				697	unsigned long *mount_opt,
				698	int is_remount)
				699	{
				700	int status;
				701	char *p;
				702
				703	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
				704	options ? options : "(none)");
				705
				706	*mount_opt = 0;
				707
				708	if (!options) {
				709	status = 1;
				710	goto bail;
				711	}
				712
				713	while ((p = strsep(&options, ",")) != NULL) {
				714	int token, option;
				715	substring_t args[MAX_OPT_ARGS];
				716
				717	if (!*p)
				718	continue;
				719
				720	token = match_token(p, tokens, args);
				721	switch (token) {
				722	case Opt_hb_local:
				723	*mount_opt \|= OCFS2_MOUNT_HB_LOCAL;
				724	break;
				725	case Opt_hb_none:
				726	*mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
				727	break;
				728	case Opt_barrier:
				729	if (match_int(&args[0], &option)) {
				730	status = 0;
				731	goto bail;
				732	}
				733	if (option)
				734	*mount_opt \|= OCFS2_MOUNT_BARRIER;
				735	else
				736	*mount_opt &= ~OCFS2_MOUNT_BARRIER;
				737	break;
				738	case Opt_intr:
				739	*mount_opt &= ~OCFS2_MOUNT_NOINTR;
				740	break;
				741	case Opt_nointr:
				742	*mount_opt \|= OCFS2_MOUNT_NOINTR;
				743	break;
				744	case Opt_err_panic:
				745	*mount_opt \|= OCFS2_MOUNT_ERRORS_PANIC;
				746	break;
				747	case Opt_err_ro:
				748	*mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
				749	break;
				750	case Opt_data_ordered:
				751	*mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
				752	break;
				753	case Opt_data_writeback:
				754	*mount_opt \|= OCFS2_MOUNT_DATA_WRITEBACK;
				755	break;
				756	default:
				757	mlog(ML_ERROR,
				758	"Unrecognized mount option \"%s\" "
				759	"or missing value\n", p);
				760	status = 0;
				761	goto bail;
				762	}
				763	}
				764
				765	status = 1;
				766
				767	bail:
				768	mlog_exit(status);
				769	return status;
				770	}
				771
				772	static int __init ocfs2_init(void)
				773	{
				774	int status;
				775
				776	mlog_entry_void();
				777
				778	ocfs2_print_version();
				779
				780	if (init_ocfs2_extent_maps())
				781	return -ENOMEM;
				782
				783	status = init_ocfs2_uptodate_cache();
				784	if (status < 0) {
				785	mlog_errno(status);
				786	goto leave;
				787	}
				788
				789	status = ocfs2_initialize_mem_caches();
				790	if (status < 0) {
				791	mlog_errno(status);
				792	goto leave;
				793	}
				794
				795	ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
				796	if (!ocfs2_wq) {
				797	status = -ENOMEM;
				798	goto leave;
				799	}
				800
				801	spin_lock(&ocfs2_globals_lock);
				802	osb_id = 0;
				803	spin_unlock(&ocfs2_globals_lock);
				804
				805	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
				806	if (!ocfs2_debugfs_root) {
				807	status = -EFAULT;
				808	mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
				809	}
				810
				811	leave:
				812	if (status < 0) {
				813	ocfs2_free_mem_caches();
				814	exit_ocfs2_uptodate_cache();
				815	exit_ocfs2_extent_maps();
				816	}
				817
				818	mlog_exit(status);
				819
				820	if (status >= 0) {
				821	return register_filesystem(&ocfs2_fs_type);
				822	} else
				823	return -1;
				824	}
				825
				826	static void __exit ocfs2_exit(void)
				827	{
				828	mlog_entry_void();
				829
				830	if (ocfs2_wq) {
				831	flush_workqueue(ocfs2_wq);
				832	destroy_workqueue(ocfs2_wq);
				833	}
				834
				835	debugfs_remove(ocfs2_debugfs_root);
				836
				837	ocfs2_free_mem_caches();
				838
				839	unregister_filesystem(&ocfs2_fs_type);
				840
				841	exit_ocfs2_extent_maps();
				842
				843	exit_ocfs2_uptodate_cache();
				844
				845	mlog_exit_void();
				846	}
				847
				848	static void ocfs2_put_super(struct super_block *sb)
				849	{
				850	mlog_entry("(0x%p)\n", sb);
				851
				852	ocfs2_sync_blockdev(sb);
				853	ocfs2_dismount_volume(sb, 0);
				854
				855	mlog_exit_void();
				856	}
				857
				858	static int ocfs2_statfs(struct super_block sb, struct kstatfs buf)
				859	{
				860	struct ocfs2_super *osb;
				861	u32 numbits, freebits;
				862	int status;
				863	struct ocfs2_dinode *bm_lock;
				864	struct buffer_head *bh = NULL;
				865	struct inode *inode = NULL;
				866
				867	mlog_entry("(%p, %p)\n", sb, buf);
				868
				869	osb = OCFS2_SB(sb);
				870
				871	inode = ocfs2_get_system_file_inode(osb,
				872	GLOBAL_BITMAP_SYSTEM_INODE,
				873	OCFS2_INVALID_SLOT);
				874	if (!inode) {
				875	mlog(ML_ERROR, "failed to get bitmap inode\n");
				876	status = -EIO;
				877	goto bail;
				878	}
				879
				880	status = ocfs2_meta_lock(inode, NULL, &bh, 0);
				881	if (status < 0) {
				882	mlog_errno(status);
				883	goto bail;
				884	}
				885
				886	bm_lock = (struct ocfs2_dinode *) bh->b_data;
				887
				888	numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
				889	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
				890
				891	buf->f_type = OCFS2_SUPER_MAGIC;
				892	buf->f_bsize = sb->s_blocksize;
				893	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
				894	buf->f_blocks = ((sector_t) numbits) *
				895	(osb->s_clustersize >> osb->sb->s_blocksize_bits);
				896	buf->f_bfree = ((sector_t) freebits) *
				897	(osb->s_clustersize >> osb->sb->s_blocksize_bits);
				898	buf->f_bavail = buf->f_bfree;
				899	buf->f_files = numbits;
				900	buf->f_ffree = freebits;
				901
				902	brelse(bh);
				903
				904	ocfs2_meta_unlock(inode, 0);
				905	status = 0;
				906	bail:
				907	if (inode)
				908	iput(inode);
				909
				910	mlog_exit(status);
				911
				912	return status;
				913	}
				914
				915	static void ocfs2_inode_init_once(void *data,
				916	kmem_cache_t *cachep,
				917	unsigned long flags)
				918	{
				919	struct ocfs2_inode_info *oi = data;
				920
				921	if ((flags & (SLAB_CTOR_VERIFY\|SLAB_CTOR_CONSTRUCTOR)) ==
				922	SLAB_CTOR_CONSTRUCTOR) {
				923	oi->ip_flags = 0;
				924	oi->ip_open_count = 0;
				925	spin_lock_init(&oi->ip_lock);
				926	ocfs2_extent_map_init(&oi->vfs_inode);
				927	INIT_LIST_HEAD(&oi->ip_handle_list);
				928	INIT_LIST_HEAD(&oi->ip_io_markers);
				929	oi->ip_handle = NULL;
				930	oi->ip_created_trans = 0;
				931	oi->ip_last_trans = 0;
				932	oi->ip_dir_start_lookup = 0;
				933
				934	init_rwsem(&oi->ip_alloc_sem);
Mark Fasheh	251b6ec	2006-01-10 15:41:43 -0800	[diff] [blame]	935	mutex_init(&oi->ip_io_mutex);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	936
				937	oi->ip_blkno = 0ULL;
				938	oi->ip_clusters = 0;
				939
				940	ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
				941	ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
				942	ocfs2_lock_res_init_once(&oi->ip_data_lockres);
				943
				944	ocfs2_metadata_cache_init(&oi->vfs_inode);
				945
				946	inode_init_once(&oi->vfs_inode);
				947	}
				948	}
				949
				950	static int ocfs2_initialize_mem_caches(void)
				951	{
				952	ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
Paul Jackson	fffb60f	2006-03-24 03:16:06 -0800	[diff] [blame]	953	sizeof(struct ocfs2_inode_info),
				954	0,
				955	(SLAB_HWCACHE_ALIGN\|SLAB_RECLAIM_ACCOUNT\|
				956	SLAB_MEM_SPREAD),
				957	ocfs2_inode_init_once, NULL);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	958	if (!ocfs2_inode_cachep)
				959	return -ENOMEM;
				960
				961	ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
				962	sizeof(struct ocfs2_journal_lock),
				963	0,
Christoph Lameter	ac2b898	2006-03-22 00:08:15 -0800	[diff] [blame]	964	SLAB_HWCACHE_ALIGN,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	965	NULL, NULL);
				966	if (!ocfs2_lock_cache)
				967	return -ENOMEM;
				968
				969	return 0;
				970	}
				971
				972	static void ocfs2_free_mem_caches(void)
				973	{
				974	if (ocfs2_inode_cachep)
				975	kmem_cache_destroy(ocfs2_inode_cachep);
				976	if (ocfs2_lock_cache)
				977	kmem_cache_destroy(ocfs2_lock_cache);
				978
				979	ocfs2_inode_cachep = NULL;
				980	ocfs2_lock_cache = NULL;
				981	}
				982
				983	static int ocfs2_get_sector(struct super_block *sb,
				984	struct buffer_head **bh,
				985	int block,
				986	int sect_size)
				987	{
				988	if (!sb_set_blocksize(sb, sect_size)) {
				989	mlog(ML_ERROR, "unable to set blocksize\n");
				990	return -EIO;
				991	}
				992
				993	*bh = sb_getblk(sb, block);
				994	if (!*bh) {
				995	mlog_errno(-EIO);
				996	return -EIO;
				997	}
				998	lock_buffer(*bh);
				999	if (!buffer_dirty(*bh))
				1000	clear_buffer_uptodate(*bh);
				1001	unlock_buffer(*bh);
				1002	ll_rw_block(READ, 1, bh);
				1003	wait_on_buffer(*bh);
				1004	return 0;
				1005	}
				1006
				1007	/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
				1008	static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
				1009	{
				1010	int status;
				1011
				1012	/* XXX hold a ref on the node while mounte? easy enough, if
				1013	* desirable. */
				1014	osb->node_num = o2nm_this_node();
				1015	if (osb->node_num == O2NM_MAX_NODES) {
				1016	mlog(ML_ERROR, "could not find this host's node number\n");
				1017	status = -ENOENT;
				1018	goto bail;
				1019	}
				1020
				1021	mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
				1022
				1023	status = 0;
				1024	bail:
				1025	return status;
				1026	}
				1027
				1028	static int ocfs2_mount_volume(struct super_block *sb)
				1029	{
				1030	int status = 0;
				1031	int unlock_super = 0;
				1032	struct ocfs2_super *osb = OCFS2_SB(sb);
				1033
				1034	mlog_entry_void();
				1035
				1036	if (ocfs2_is_hard_readonly(osb))
				1037	goto leave;
				1038
				1039	status = ocfs2_fill_local_node_info(osb);
				1040	if (status < 0) {
				1041	mlog_errno(status);
				1042	goto leave;
				1043	}
				1044
				1045	status = ocfs2_register_hb_callbacks(osb);
				1046	if (status < 0) {
				1047	mlog_errno(status);
				1048	goto leave;
				1049	}
				1050
				1051	status = ocfs2_dlm_init(osb);
				1052	if (status < 0) {
				1053	mlog_errno(status);
				1054	goto leave;
				1055	}
				1056
				1057	/* requires vote_thread to be running. */
				1058	status = ocfs2_register_net_handlers(osb);
				1059	if (status < 0) {
				1060	mlog_errno(status);
				1061	goto leave;
				1062	}
				1063
				1064	status = ocfs2_super_lock(osb, 1);
				1065	if (status < 0) {
				1066	mlog_errno(status);
				1067	goto leave;
				1068	}
				1069	unlock_super = 1;
				1070
				1071	/* This will load up the node map and add ourselves to it. */
				1072	status = ocfs2_find_slot(osb);
				1073	if (status < 0) {
				1074	mlog_errno(status);
				1075	goto leave;
				1076	}
				1077
				1078	ocfs2_populate_mounted_map(osb);
				1079
				1080	/* load all node-local system inodes */
				1081	status = ocfs2_init_local_system_inodes(osb);
				1082	if (status < 0) {
				1083	mlog_errno(status);
				1084	goto leave;
				1085	}
				1086
				1087	status = ocfs2_check_volume(osb);
				1088	if (status < 0) {
				1089	mlog_errno(status);
				1090	goto leave;
				1091	}
				1092
				1093	status = ocfs2_truncate_log_init(osb);
				1094	if (status < 0) {
				1095	mlog_errno(status);
				1096	goto leave;
				1097	}
				1098
				1099	/* This should be sent after we recovered our journal as it
				1100	* will cause other nodes to unmark us as needing
				1101	* recovery. However, we need to send it before dropping the
				1102	* super block lock as otherwise their recovery threads might
				1103	* try to clean us up while we're live! */
				1104	status = ocfs2_request_mount_vote(osb);
				1105	if (status < 0)
				1106	mlog_errno(status);
				1107
				1108	leave:
				1109	if (unlock_super)
				1110	ocfs2_super_unlock(osb, 1);
				1111
				1112	mlog_exit(status);
				1113	return status;
				1114	}
				1115
				1116	/* we can't grab the goofy sem lock from inside wait_event, so we use
				1117	* memory barriers to make sure that we'll see the null task before
				1118	* being woken up */
				1119	static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
				1120	{
				1121	mb();
				1122	return osb->recovery_thread_task != NULL;
				1123	}
				1124
				1125	static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
				1126	{
				1127	int tmp;
				1128	struct ocfs2_super *osb = NULL;
				1129
				1130	mlog_entry("(0x%p)\n", sb);
				1131
				1132	BUG_ON(!sb);
				1133	osb = OCFS2_SB(sb);
				1134	BUG_ON(!osb);
				1135
				1136	ocfs2_shutdown_local_alloc(osb);
				1137
				1138	ocfs2_truncate_log_shutdown(osb);
				1139
				1140	/* disable any new recovery threads and wait for any currently
				1141	* running ones to exit. Do this before setting the vol_state. */
Arjan van de Ven	c74ec2f	2006-01-13 21:54:23 -0800	[diff] [blame]	1142	mutex_lock(&osb->recovery_lock);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1143	osb->disable_recovery = 1;
Arjan van de Ven	c74ec2f	2006-01-13 21:54:23 -0800	[diff] [blame]	1144	mutex_unlock(&osb->recovery_lock);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1145	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
				1146
				1147	/* At this point, we know that no more recovery threads can be
				1148	* launched, so wait for any recovery completion work to
				1149	* complete. */
				1150	flush_workqueue(ocfs2_wq);
				1151
				1152	ocfs2_journal_shutdown(osb);
				1153
				1154	ocfs2_sync_blockdev(sb);
				1155
				1156	/* No dlm means we've failed during mount, so skip all the
				1157	* steps which depended on that to complete. */
				1158	if (osb->dlm) {
				1159	tmp = ocfs2_super_lock(osb, 1);
				1160	if (tmp < 0) {
				1161	mlog_errno(tmp);
				1162	return;
				1163	}
				1164
				1165	tmp = ocfs2_request_umount_vote(osb);
				1166	if (tmp < 0)
				1167	mlog_errno(tmp);
				1168
				1169	if (osb->slot_num != OCFS2_INVALID_SLOT)
				1170	ocfs2_put_slot(osb);
				1171
				1172	ocfs2_super_unlock(osb, 1);
				1173	}
				1174
				1175	ocfs2_release_system_inodes(osb);
				1176
				1177	if (osb->dlm) {
				1178	ocfs2_unregister_net_handlers(osb);
				1179
				1180	ocfs2_dlm_shutdown(osb);
				1181	}
				1182
				1183	ocfs2_clear_hb_callbacks(osb);
				1184
				1185	debugfs_remove(osb->osb_debug_root);
				1186
				1187	if (!mnt_err)
				1188	ocfs2_stop_heartbeat(osb);
				1189
				1190	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
				1191
				1192	printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
				1193	MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
				1194
				1195	ocfs2_delete_osb(osb);
				1196	kfree(osb);
				1197	sb->s_dev = 0;
				1198	sb->s_fs_info = NULL;
				1199	}
				1200
				1201	static int ocfs2_setup_osb_uuid(struct ocfs2_super osb, const unsigned char uuid,
				1202	unsigned uuid_bytes)
				1203	{
				1204	int i, ret;
				1205	char *ptr;
				1206
				1207	BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
				1208
				1209	osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
				1210	if (osb->uuid_str == NULL)
				1211	return -ENOMEM;
				1212
				1213	memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
				1214
				1215	for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
				1216	/* print with null */
				1217	ret = snprintf(ptr, 3, "%02X", uuid[i]);
				1218	if (ret != 2) /* drop super cleans up */
				1219	return -EINVAL;
				1220	/* then only advance past the last char */
				1221	ptr += 2;
				1222	}
				1223
				1224	return 0;
				1225	}
				1226
				1227	static int ocfs2_initialize_super(struct super_block *sb,
				1228	struct buffer_head *bh,
				1229	int sector_size)
				1230	{
				1231	int status = 0;
				1232	int i;
				1233	struct ocfs2_dinode *di = NULL;
				1234	struct inode *inode = NULL;
				1235	struct buffer_head *bitmap_bh = NULL;
				1236	struct ocfs2_journal *journal;
				1237	__le32 uuid_net_key;
				1238	struct ocfs2_super *osb;
				1239
				1240	mlog_entry_void();
				1241
				1242	osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
				1243	if (!osb) {
				1244	status = -ENOMEM;
				1245	mlog_errno(status);
				1246	goto bail;
				1247	}
				1248
				1249	sb->s_fs_info = osb;
				1250	sb->s_op = &ocfs2_sops;
				1251	sb->s_export_op = &ocfs2_export_ops;
				1252	sb->s_flags \|= MS_NOATIME;
				1253	/* this is needed to support O_LARGEFILE */
				1254	sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
				1255
				1256	osb->sb = sb;
				1257	/* Save off for ocfs2_rw_direct */
				1258	osb->s_sectsize_bits = blksize_bits(sector_size);
Eric Sesterhenn / snakebyte	ebdec83	2006-01-27 10:32:52 +0100	[diff] [blame]	1259	BUG_ON(!osb->s_sectsize_bits);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1260
				1261	osb->net_response_ids = 0;
				1262	spin_lock_init(&osb->net_response_lock);
				1263	INIT_LIST_HEAD(&osb->net_response_list);
				1264
				1265	INIT_LIST_HEAD(&osb->osb_net_handlers);
				1266	init_waitqueue_head(&osb->recovery_event);
				1267	spin_lock_init(&osb->vote_task_lock);
				1268	init_waitqueue_head(&osb->vote_event);
				1269	osb->vote_work_sequence = 0;
				1270	osb->vote_wake_sequence = 0;
				1271	INIT_LIST_HEAD(&osb->blocked_lock_list);
				1272	osb->blocked_lock_count = 0;
				1273	INIT_LIST_HEAD(&osb->vote_list);
				1274	spin_lock_init(&osb->osb_lock);
				1275
				1276	atomic_set(&osb->alloc_stats.moves, 0);
				1277	atomic_set(&osb->alloc_stats.local_data, 0);
				1278	atomic_set(&osb->alloc_stats.bitmap_data, 0);
				1279	atomic_set(&osb->alloc_stats.bg_allocs, 0);
				1280	atomic_set(&osb->alloc_stats.bg_extends, 0);
				1281
				1282	ocfs2_init_node_maps(osb);
				1283
				1284	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
				1285	MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
				1286
Arjan van de Ven	c74ec2f	2006-01-13 21:54:23 -0800	[diff] [blame]	1287	mutex_init(&osb->recovery_lock);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1288
				1289	osb->disable_recovery = 0;
				1290	osb->recovery_thread_task = NULL;
				1291
				1292	init_waitqueue_head(&osb->checkpoint_event);
				1293	atomic_set(&osb->needs_checkpoint, 0);
				1294
				1295	osb->node_num = O2NM_INVALID_NODE_NUM;
				1296	osb->slot_num = OCFS2_INVALID_SLOT;
				1297
				1298	osb->local_alloc_state = OCFS2_LA_UNUSED;
				1299	osb->local_alloc_bh = NULL;
				1300
				1301	ocfs2_setup_hb_callbacks(osb);
				1302
				1303	init_waitqueue_head(&osb->osb_mount_event);
				1304
				1305	osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
				1306	if (!osb->vol_label) {
				1307	mlog(ML_ERROR, "unable to alloc vol label\n");
				1308	status = -ENOMEM;
				1309	goto bail;
				1310	}
				1311
				1312	osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
				1313	if (!osb->uuid) {
				1314	mlog(ML_ERROR, "unable to alloc uuid\n");
				1315	status = -ENOMEM;
				1316	goto bail;
				1317	}
				1318
				1319	di = (struct ocfs2_dinode *)bh->b_data;
				1320
				1321	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
				1322	if (osb->max_slots > OCFS2_MAX_SLOTS \|\| osb->max_slots == 0) {
				1323	mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
				1324	osb->max_slots);
				1325	status = -EINVAL;
				1326	goto bail;
				1327	}
				1328	mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
				1329
Mark Fasheh	b4df6ed	2006-02-22 17:35:08 -0800	[diff] [blame]	1330	init_waitqueue_head(&osb->osb_wipe_event);
				1331	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
				1332	sizeof(*osb->osb_orphan_wipes),
				1333	GFP_KERNEL);
				1334	if (!osb->osb_orphan_wipes) {
				1335	status = -ENOMEM;
				1336	mlog_errno(status);
				1337	goto bail;
				1338	}
				1339
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1340	osb->s_feature_compat =
				1341	le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
				1342	osb->s_feature_ro_compat =
				1343	le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
				1344	osb->s_feature_incompat =
				1345	le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
				1346
				1347	if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
				1348	mlog(ML_ERROR, "couldn't mount because of unsupported "
				1349	"optional features (%x).\n", i);
				1350	status = -EINVAL;
				1351	goto bail;
				1352	}
				1353	if (!(osb->sb->s_flags & MS_RDONLY) &&
				1354	(i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
				1355	mlog(ML_ERROR, "couldn't mount RDWR because of "
				1356	"unsupported optional features (%x).\n", i);
				1357	status = -EINVAL;
				1358	goto bail;
				1359	}
				1360
				1361	get_random_bytes(&osb->s_next_generation, sizeof(u32));
				1362
				1363	/* FIXME
				1364	* This should be done in ocfs2_journal_init(), but unknown
				1365	* ordering issues will cause the filesystem to crash.
				1366	* If anyone wants to figure out what part of the code
				1367	* refers to osb->journal before ocfs2_journal_init() is run,
				1368	* be my guest.
				1369	*/
				1370	/* initialize our journal structure */
				1371
				1372	journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
				1373	if (!journal) {
				1374	mlog(ML_ERROR, "unable to alloc journal\n");
				1375	status = -ENOMEM;
				1376	goto bail;
				1377	}
				1378	osb->journal = journal;
				1379	journal->j_osb = osb;
				1380
				1381	atomic_set(&journal->j_num_trans, 0);
				1382	init_rwsem(&journal->j_trans_barrier);
				1383	init_waitqueue_head(&journal->j_checkpointed);
				1384	spin_lock_init(&journal->j_lock);
				1385	journal->j_trans_id = (unsigned long) 1;
				1386	INIT_LIST_HEAD(&journal->j_la_cleanups);
				1387	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
				1388	journal->j_state = OCFS2_JOURNAL_FREE;
				1389
				1390	/* get some pseudo constants for clustersize bits */
				1391	osb->s_clustersize_bits =
				1392	le32_to_cpu(di->id2.i_super.s_clustersize_bits);
				1393	osb->s_clustersize = 1 << osb->s_clustersize_bits;
				1394	mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
				1395
				1396	if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE \|\|
				1397	osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
				1398	mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
				1399	osb->s_clustersize);
				1400	status = -EINVAL;
				1401	goto bail;
				1402	}
				1403
				1404	if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
				1405	> (u32)~0UL) {
				1406	mlog(ML_ERROR, "Volume might try to write to blocks beyond "
				1407	"what jbd can address in 32 bits.\n");
				1408	status = -EINVAL;
				1409	goto bail;
				1410	}
				1411
				1412	if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
				1413	sizeof(di->id2.i_super.s_uuid))) {
				1414	mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
				1415	status = -ENOMEM;
				1416	goto bail;
				1417	}
				1418
				1419	memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key));
				1420	osb->net_key = le32_to_cpu(uuid_net_key);
				1421
				1422	strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
				1423	osb->vol_label[63] = '\0';
				1424	osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
				1425	osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
				1426	osb->first_cluster_group_blkno =
				1427	le64_to_cpu(di->id2.i_super.s_first_cluster_group);
				1428	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
				1429	mlog(0, "vol_label: %s\n", osb->vol_label);
				1430	mlog(0, "uuid: %s\n", osb->uuid_str);
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	1431	mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
				1432	(unsigned long long)osb->root_blkno,
				1433	(unsigned long long)osb->system_dir_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1434
				1435	osb->osb_dlm_debug = ocfs2_new_dlm_debug();
				1436	if (!osb->osb_dlm_debug) {
				1437	status = -ENOMEM;
				1438	mlog_errno(status);
				1439	goto bail;
				1440	}
				1441
				1442	atomic_set(&osb->vol_state, VOLUME_INIT);
				1443
				1444	/* load root, system_dir, and all global system inodes */
				1445	status = ocfs2_init_global_system_inodes(osb);
				1446	if (status < 0) {
				1447	mlog_errno(status);
				1448	goto bail;
				1449	}
				1450
				1451	/*
				1452	* global bitmap
				1453	*/
				1454	inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
				1455	OCFS2_INVALID_SLOT);
				1456	if (!inode) {
				1457	status = -EINVAL;
				1458	mlog_errno(status);
				1459	goto bail;
				1460	}
				1461
				1462	osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
				1463
				1464	status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
				1465	inode);
				1466	iput(inode);
				1467	if (status < 0) {
				1468	mlog_errno(status);
				1469	goto bail;
				1470	}
				1471
				1472	di = (struct ocfs2_dinode *) bitmap_bh->b_data;
				1473	osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
				1474	osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
				1475	brelse(bitmap_bh);
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	1476	mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
				1477	(unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1478
				1479	status = ocfs2_init_slot_info(osb);
				1480	if (status < 0) {
				1481	mlog_errno(status);
				1482	goto bail;
				1483	}
				1484
				1485	/* Link this osb onto the global linked list of all osb structures. */
				1486	/* The Global Link List is mainted for the whole driver . */
				1487	spin_lock(&ocfs2_globals_lock);
				1488	osb->osb_id = osb_id;
				1489	if (osb_id < OCFS2_MAX_OSB_ID)
				1490	osb_id++;
				1491	else {
				1492	mlog(ML_ERROR, "Too many volumes mounted\n");
				1493	status = -ENOMEM;
				1494	}
				1495	spin_unlock(&ocfs2_globals_lock);
				1496
				1497	bail:
				1498	mlog_exit(status);
				1499	return status;
				1500	}
				1501
				1502	/*
				1503	* will return: -EAGAIN if it is ok to keep searching for superblocks
				1504	* -EINVAL if there is a bad superblock
				1505	* 0 on success
				1506	*/
				1507	static int ocfs2_verify_volume(struct ocfs2_dinode *di,
				1508	struct buffer_head *bh,
				1509	u32 blksz)
				1510	{
				1511	int status = -EAGAIN;
				1512
				1513	mlog_entry_void();
				1514
				1515	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
				1516	strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
				1517	status = -EINVAL;
				1518	if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
				1519	mlog(ML_ERROR, "found superblock with incorrect block "
				1520	"size: found %u, should be %u\n",
				1521	1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
				1522	blksz);
				1523	} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
				1524	OCFS2_MAJOR_REV_LEVEL \|\|
				1525	le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
				1526	OCFS2_MINOR_REV_LEVEL) {
				1527	mlog(ML_ERROR, "found superblock with bad version: "
				1528	"found %u.%u, should be %u.%u\n",
				1529	le16_to_cpu(di->id2.i_super.s_major_rev_level),
				1530	le16_to_cpu(di->id2.i_super.s_minor_rev_level),
				1531	OCFS2_MAJOR_REV_LEVEL,
				1532	OCFS2_MINOR_REV_LEVEL);
				1533	} else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
				1534	mlog(ML_ERROR, "bad block number on superblock: "
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	1535	"found %llu, should be %llu\n",
				1536	(unsigned long long)di->i_blkno,
				1537	(unsigned long long)bh->b_blocknr);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1538	} else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 \|\|
				1539	le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
				1540	mlog(ML_ERROR, "bad cluster size found: %u\n",
				1541	1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
				1542	} else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
				1543	mlog(ML_ERROR, "bad root_blkno: 0\n");
				1544	} else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
				1545	mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
				1546	} else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
				1547	mlog(ML_ERROR,
				1548	"Superblock slots found greater than file system "
				1549	"maximum: found %u, max %u\n",
				1550	le16_to_cpu(di->id2.i_super.s_max_slots),
				1551	OCFS2_MAX_SLOTS);
				1552	} else {
				1553	/* found it! */
				1554	status = 0;
				1555	}
				1556	}
				1557
				1558	mlog_exit(status);
				1559	return status;
				1560	}
				1561
				1562	static int ocfs2_check_volume(struct ocfs2_super *osb)
				1563	{
				1564	int status = 0;
				1565	int dirty;
				1566	struct ocfs2_dinode local_alloc = NULL; / only used if we
				1567	* recover
				1568	* ourselves. */
				1569
				1570	mlog_entry_void();
				1571
				1572	/* Init our journal object. */
				1573	status = ocfs2_journal_init(osb->journal, &dirty);
				1574	if (status < 0) {
				1575	mlog(ML_ERROR, "Could not initialize journal!\n");
				1576	goto finally;
				1577	}
				1578
				1579	/* If the journal was unmounted cleanly then we don't want to
				1580	* recover anything. Otherwise, journal_load will do that
				1581	* dirty work for us :) */
				1582	if (!dirty) {
				1583	status = ocfs2_journal_wipe(osb->journal, 0);
				1584	if (status < 0) {
				1585	mlog_errno(status);
				1586	goto finally;
				1587	}
				1588	} else {
				1589	mlog(ML_NOTICE, "File system was not unmounted cleanly, "
				1590	"recovering volume.\n");
				1591	}
				1592
				1593	/* will play back anything left in the journal. */
				1594	ocfs2_journal_load(osb->journal);
				1595
				1596	if (dirty) {
				1597	/* recover my local alloc if we didn't unmount cleanly. */
				1598	status = ocfs2_begin_local_alloc_recovery(osb,
				1599	osb->slot_num,
				1600	&local_alloc);
				1601	if (status < 0) {
				1602	mlog_errno(status);
				1603	goto finally;
				1604	}
				1605	/* we complete the recovery process after we've marked
				1606	* ourselves as mounted. */
				1607	}
				1608
				1609	mlog(0, "Journal loaded.\n");
				1610
				1611	status = ocfs2_load_local_alloc(osb);
				1612	if (status < 0) {
				1613	mlog_errno(status);
				1614	goto finally;
				1615	}
				1616
				1617	if (dirty) {
				1618	/* Recovery will be completed after we've mounted the
				1619	* rest of the volume. */
				1620	osb->dirty = 1;
				1621	osb->local_alloc_copy = local_alloc;
				1622	local_alloc = NULL;
				1623	}
				1624
				1625	/* go through each journal, trylock it and if you get the
				1626	* lock, and it's marked as dirty, set the bit in the recover
				1627	* map and launch a recovery thread for it. */
				1628	status = ocfs2_mark_dead_nodes(osb);
				1629	if (status < 0)
				1630	mlog_errno(status);
				1631
				1632	finally:
				1633	if (local_alloc)
				1634	kfree(local_alloc);
				1635
				1636	mlog_exit(status);
				1637	return status;
				1638	}
				1639
				1640	/*
				1641	* The routine gets called from dismount or close whenever a dismount on
				1642	* volume is requested and the osb open count becomes 1.
				1643	* It will remove the osb from the global list and also free up all the
				1644	* initialized resources and fileobject.
				1645	*/
				1646	static void ocfs2_delete_osb(struct ocfs2_super *osb)
				1647	{
				1648	mlog_entry_void();
				1649
				1650	/* This function assumes that the caller has the main osb resource */
				1651
				1652	if (osb->slot_info)
				1653	ocfs2_free_slot_info(osb->slot_info);
				1654
Mark Fasheh	b4df6ed	2006-02-22 17:35:08 -0800	[diff] [blame]	1655	kfree(osb->osb_orphan_wipes);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1656	/* FIXME
				1657	* This belongs in journal shutdown, but because we have to
				1658	* allocate osb->journal at the start of ocfs2_initalize_osb(),
				1659	* we free it here.
				1660	*/
				1661	kfree(osb->journal);
				1662	if (osb->local_alloc_copy)
				1663	kfree(osb->local_alloc_copy);
				1664	kfree(osb->uuid_str);
				1665	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
				1666	memset(osb, 0, sizeof(struct ocfs2_super));
				1667
				1668	mlog_exit_void();
				1669	}
				1670
				1671	/* Put OCFS2 into a readonly state, or (if the user specifies it),
				1672	* panic(). We do not support continue-on-error operation. */
				1673	static void ocfs2_handle_error(struct super_block *sb)
				1674	{
				1675	struct ocfs2_super *osb = OCFS2_SB(sb);
				1676
				1677	if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
				1678	panic("OCFS2: (device %s): panic forced after error\n",
				1679	sb->s_id);
				1680
				1681	ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
				1682
				1683	if (sb->s_flags & MS_RDONLY &&
				1684	(ocfs2_is_soft_readonly(osb) \|\|
				1685	ocfs2_is_hard_readonly(osb)))
				1686	return;
				1687
				1688	printk(KERN_CRIT "File system is now read-only due to the potential "
				1689	"of on-disk corruption. Please run fsck.ocfs2 once the file "
				1690	"system is unmounted.\n");
				1691	sb->s_flags \|= MS_RDONLY;
				1692	ocfs2_set_ro_flag(osb, 0);
				1693	}
				1694
				1695	static char error_buf[1024];
				1696
				1697	void __ocfs2_error(struct super_block *sb,
				1698	const char *function,
				1699	const char *fmt, ...)
				1700	{
				1701	va_list args;
				1702
				1703	va_start(args, fmt);
				1704	vsprintf(error_buf, fmt, args);
				1705	va_end(args);
				1706
				1707	/* Not using mlog here because we want to show the actual
				1708	* function the error came from. */
				1709	printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
				1710	sb->s_id, function, error_buf);
				1711
				1712	ocfs2_handle_error(sb);
				1713	}
				1714
				1715	/* Handle critical errors. This is intentionally more drastic than
				1716	* ocfs2_handle_error, so we only use for things like journal errors,
				1717	* etc. */
				1718	void __ocfs2_abort(struct super_block* sb,
				1719	const char *function,
				1720	const char *fmt, ...)
				1721	{
				1722	va_list args;
				1723
				1724	va_start(args, fmt);
				1725	vsprintf(error_buf, fmt, args);
				1726	va_end(args);
				1727
				1728	printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
				1729	sb->s_id, function, error_buf);
				1730
				1731	/* We don't have the cluster support yet to go straight to
				1732	* hard readonly in here. Until then, we want to keep
				1733	* ocfs2_abort() so that we can at least mark critical
				1734	* errors.
				1735	*
				1736	* TODO: This should abort the journal and alert other nodes
				1737	* that our slot needs recovery. */
				1738
				1739	/* Force a panic(). This stinks, but it's better than letting
				1740	* things continue without having a proper hard readonly
				1741	* here. */
				1742	OCFS2_SB(sb)->s_mount_opt \|= OCFS2_MOUNT_ERRORS_PANIC;
				1743	ocfs2_handle_error(sb);
				1744	}
				1745
				1746	module_init(ocfs2_init);
				1747	module_exit(ocfs2_exit);