Blame - fs/ocfs2/file.c - kernel/msm

blob: f2cd3bf9efb2f6b2dff8b44c44592355e2b44a6e [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -*- mode: c; c-basic-offset: 8; -*-
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* file.c
				5	*
				6	* File open, close, extend, truncate
				7	*
				8	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				9	*
				10	* This program is free software; you can redistribute it and/or
				11	* modify it under the terms of the GNU General Public
				12	* License as published by the Free Software Foundation; either
				13	* version 2 of the License, or (at your option) any later version.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 02111-1307, USA.
				24	*/
				25
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	26	#include <linux/capability.h>
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	27	#include <linux/fs.h>
				28	#include <linux/types.h>
				29	#include <linux/slab.h>
				30	#include <linux/highmem.h>
				31	#include <linux/pagemap.h>
				32	#include <linux/uio.h>
Mark Fasheh	e2057c5	2006-10-03 17:53:05 -0700	[diff] [blame]	33	#include <linux/sched.h>
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	34	#include <linux/pipe_fs_i.h>
Tiger Yang	7f1a37e	2006-11-15 15:48:42 +0800	[diff] [blame]	35	#include <linux/mount.h>
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	36
				37	#define MLOG_MASK_PREFIX ML_INODE
				38	#include <cluster/masklog.h>
				39
				40	#include "ocfs2.h"
				41
				42	#include "alloc.h"
				43	#include "aops.h"
				44	#include "dir.h"
				45	#include "dlmglue.h"
				46	#include "extent_map.h"
				47	#include "file.h"
				48	#include "sysfile.h"
				49	#include "inode.h"
Herbert Poetzl	ca4d147	2006-07-03 17:27:12 -0700	[diff] [blame]	50	#include "ioctl.h"
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	51	#include "journal.h"
				52	#include "mmap.h"
				53	#include "suballoc.h"
				54	#include "super.h"
				55
				56	#include "buffer_head_io.h"
				57
				58	static int ocfs2_sync_inode(struct inode *inode)
				59	{
				60	filemap_fdatawrite(inode->i_mapping);
				61	return sync_mapping_buffers(inode->i_mapping);
				62	}
				63
				64	static int ocfs2_file_open(struct inode inode, struct file file)
				65	{
				66	int status;
				67	int mode = file->f_flags;
				68	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				69
				70	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	71	file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	72
				73	spin_lock(&oi->ip_lock);
				74
				75	/* Check that the inode hasn't been wiped from disk by another
				76	* node. If it hasn't then we're safe as long as we hold the
				77	* spin lock until our increment of open count. */
				78	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
				79	spin_unlock(&oi->ip_lock);
				80
				81	status = -ENOENT;
				82	goto leave;
				83	}
				84
				85	if (mode & O_DIRECT)
				86	oi->ip_flags \|= OCFS2_INODE_OPEN_DIRECT;
				87
				88	oi->ip_open_count++;
				89	spin_unlock(&oi->ip_lock);
				90	status = 0;
				91	leave:
				92	mlog_exit(status);
				93	return status;
				94	}
				95
				96	static int ocfs2_file_release(struct inode inode, struct file file)
				97	{
				98	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				99
				100	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	101	file->f_path.dentry->d_name.len,
				102	file->f_path.dentry->d_name.name);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	103
				104	spin_lock(&oi->ip_lock);
				105	if (!--oi->ip_open_count)
				106	oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
				107	spin_unlock(&oi->ip_lock);
				108
				109	mlog_exit(0);
				110
				111	return 0;
				112	}
				113
				114	static int ocfs2_sync_file(struct file *file,
				115	struct dentry *dentry,
				116	int datasync)
				117	{
				118	int err = 0;
				119	journal_t *journal;
				120	struct inode *inode = dentry->d_inode;
				121	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				122
				123	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
				124	dentry->d_name.len, dentry->d_name.name);
				125
				126	err = ocfs2_sync_inode(dentry->d_inode);
				127	if (err)
				128	goto bail;
				129
				130	journal = osb->journal->j_journal;
				131	err = journal_force_commit(journal);
				132
				133	bail:
				134	mlog_exit(err);
				135
				136	return (err < 0) ? -EIO : 0;
				137	}
				138
Tiger Yang	7f1a37e	2006-11-15 15:48:42 +0800	[diff] [blame]	139	int ocfs2_should_update_atime(struct inode *inode,
				140	struct vfsmount *vfsmnt)
				141	{
				142	struct timespec now;
				143	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				144
				145	if (ocfs2_is_hard_readonly(osb) \|\| ocfs2_is_soft_readonly(osb))
				146	return 0;
				147
				148	if ((inode->i_flags & S_NOATIME) \|\|
				149	((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
				150	return 0;
				151
Mark Fasheh	6c2aad0	2006-12-19 15:25:52 -0800	[diff] [blame]	152	/*
				153	* We can be called with no vfsmnt structure - NFSD will
				154	* sometimes do this.
				155	*
				156	* Note that our action here is different than touch_atime() -
				157	* if we can't tell whether this is a noatime mount, then we
				158	* don't know whether to trust the value of s_atime_quantum.
				159	*/
				160	if (vfsmnt == NULL)
				161	return 0;
				162
Tiger Yang	7f1a37e	2006-11-15 15:48:42 +0800	[diff] [blame]	163	if ((vfsmnt->mnt_flags & MNT_NOATIME) \|\|
				164	((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
				165	return 0;
				166
Mark Fasheh	7e913c5	2006-12-13 00:34:35 -0800	[diff] [blame]	167	if (vfsmnt->mnt_flags & MNT_RELATIME) {
				168	if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) \|\|
				169	(timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
				170	return 1;
				171
				172	return 0;
				173	}
				174
Tiger Yang	7f1a37e	2006-11-15 15:48:42 +0800	[diff] [blame]	175	now = CURRENT_TIME;
				176	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
				177	return 0;
				178	else
				179	return 1;
				180	}
				181
				182	int ocfs2_update_inode_atime(struct inode *inode,
				183	struct buffer_head *bh)
				184	{
				185	int ret;
				186	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				187	handle_t *handle;
				188
				189	mlog_entry_void();
				190
				191	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				192	if (handle == NULL) {
				193	ret = -ENOMEM;
				194	mlog_errno(ret);
				195	goto out;
				196	}
				197
				198	inode->i_atime = CURRENT_TIME;
				199	ret = ocfs2_mark_inode_dirty(handle, inode, bh);
				200	if (ret < 0)
				201	mlog_errno(ret);
				202
				203	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
				204	out:
				205	mlog_exit(ret);
				206	return ret;
				207	}
				208
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	209	int ocfs2_set_inode_size(handle_t *handle,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	210	struct inode *inode,
				211	struct buffer_head *fe_bh,
				212	u64 new_i_size)
				213	{
				214	int status;
				215
				216	mlog_entry_void();
				217	i_size_write(inode, new_i_size);
				218	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
				219	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
				220
				221	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
				222	if (status < 0) {
				223	mlog_errno(status);
				224	goto bail;
				225	}
				226
				227	bail:
				228	mlog_exit(status);
				229	return status;
				230	}
				231
				232	static int ocfs2_simple_size_update(struct inode *inode,
				233	struct buffer_head *di_bh,
				234	u64 new_i_size)
				235	{
				236	int ret;
				237	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	238	handle_t *handle = NULL;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	239
Mark Fasheh	65eff9c	2006-10-09 17:26:22 -0700	[diff] [blame]	240	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	241	if (handle == NULL) {
				242	ret = -ENOMEM;
				243	mlog_errno(ret);
				244	goto out;
				245	}
				246
				247	ret = ocfs2_set_inode_size(handle, inode, di_bh,
				248	new_i_size);
				249	if (ret < 0)
				250	mlog_errno(ret);
				251
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	252	ocfs2_commit_trans(osb, handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	253	out:
				254	return ret;
				255	}
				256
				257	static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				258	struct inode *inode,
				259	struct buffer_head *fe_bh,
				260	u64 new_i_size)
				261	{
				262	int status;
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	263	handle_t *handle;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	264
				265	mlog_entry_void();
				266
				267	/* TODO: This needs to actually orphan the inode in this
				268	* transaction. */
				269
Mark Fasheh	65eff9c	2006-10-09 17:26:22 -0700	[diff] [blame]	270	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	271	if (IS_ERR(handle)) {
				272	status = PTR_ERR(handle);
				273	mlog_errno(status);
				274	goto out;
				275	}
				276
				277	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
				278	if (status < 0)
				279	mlog_errno(status);
				280
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	281	ocfs2_commit_trans(osb, handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	282	out:
				283	mlog_exit(status);
				284	return status;
				285	}
				286
				287	static int ocfs2_truncate_file(struct inode *inode,
				288	struct buffer_head *di_bh,
				289	u64 new_i_size)
				290	{
				291	int status = 0;
				292	struct ocfs2_dinode *fe = NULL;
				293	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				294	struct ocfs2_truncate_context *tc = NULL;
				295
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	296	mlog_entry("(inode = %llu, new_i_size = %llu\n",
				297	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				298	(unsigned long long)new_i_size);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	299
				300	truncate_inode_pages(inode->i_mapping, new_i_size);
				301
				302	fe = (struct ocfs2_dinode *) di_bh->b_data;
				303	if (!OCFS2_IS_VALID_DINODE(fe)) {
				304	OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
				305	status = -EIO;
				306	goto bail;
				307	}
				308
				309	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	310	"Inode %llu, inode i_size = %lld != di "
				311	"i_size = %llu, i_flags = 0x%x\n",
				312	(unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	313	i_size_read(inode),
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	314	(unsigned long long)le64_to_cpu(fe->i_size),
				315	le32_to_cpu(fe->i_flags));
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	316
				317	if (new_i_size > le64_to_cpu(fe->i_size)) {
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	318	mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
				319	(unsigned long long)le64_to_cpu(fe->i_size),
				320	(unsigned long long)new_i_size);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	321	status = -EINVAL;
				322	mlog_errno(status);
				323	goto bail;
				324	}
				325
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	326	mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
				327	(unsigned long long)le64_to_cpu(fe->i_blkno),
				328	(unsigned long long)le64_to_cpu(fe->i_size),
				329	(unsigned long long)new_i_size);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	330
				331	/* lets handle the simple truncate cases before doing any more
				332	* cluster locking. */
				333	if (new_i_size == le64_to_cpu(fe->i_size))
				334	goto bail;
				335
Mark Fasheh	ab0920c	2006-03-16 15:06:37 -0800	[diff] [blame]	336	/* This forces other nodes to sync and drop their pages. Do
				337	* this even if we have a truncate without allocation change -
				338	* ocfs2 cluster sizes can be much greater than page size, so
				339	* we have to truncate them anyway. */
				340	status = ocfs2_data_lock(inode, 1);
				341	if (status < 0) {
				342	mlog_errno(status);
				343	goto bail;
				344	}
				345	ocfs2_data_unlock(inode, 1);
				346
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	347	if (le32_to_cpu(fe->i_clusters) ==
				348	ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
				349	mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
				350	fe->i_clusters);
				351	/* No allocation change is required, so lets fast path
				352	* this truncate. */
				353	status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
				354	if (status < 0)
				355	mlog_errno(status);
				356	goto bail;
				357	}
				358
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	359	/* alright, we're going to need to do a full blown alloc size
				360	* change. Orphan the inode so that recovery can complete the
				361	* truncate if necessary. This does the task of marking
				362	* i_size. */
				363	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
				364	if (status < 0) {
				365	mlog_errno(status);
				366	goto bail;
				367	}
				368
				369	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
				370	if (status < 0) {
				371	mlog_errno(status);
				372	goto bail;
				373	}
				374
				375	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
				376	if (status < 0) {
				377	mlog_errno(status);
				378	goto bail;
				379	}
				380
				381	/* TODO: orphan dir cleanup here. */
				382	bail:
				383
				384	mlog_exit(status);
				385	return status;
				386	}
				387
				388	/*
				389	* extend allocation only here.
				390	* we'll update all the disk stuff, and oip->alloc_size
				391	*
				392	* expect stuff to be locked, a transaction started and enough data /
				393	* metadata reservations in the contexts.
				394	*
				395	* Will return -EAGAIN, and a reason if a restart is needed.
				396	* If passed in, *reason will always be set, even in error.
				397	*/
				398	int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
				399	struct inode *inode,
				400	u32 clusters_to_add,
				401	struct buffer_head *fe_bh,
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	402	handle_t *handle,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	403	struct ocfs2_alloc_context *data_ac,
				404	struct ocfs2_alloc_context *meta_ac,
				405	enum ocfs2_alloc_restarted *reason_ret)
				406	{
				407	int status = 0;
				408	int free_extents;
				409	struct ocfs2_dinode fe = (struct ocfs2_dinode ) fe_bh->b_data;
				410	enum ocfs2_alloc_restarted reason = RESTART_NONE;
				411	u32 bit_off, num_bits;
				412	u64 block;
				413
				414	BUG_ON(!clusters_to_add);
				415
				416	free_extents = ocfs2_num_free_extents(osb, inode, fe);
				417	if (free_extents < 0) {
				418	status = free_extents;
				419	mlog_errno(status);
				420	goto leave;
				421	}
				422
				423	/* there are two cases which could cause us to EAGAIN in the
				424	* we-need-more-metadata case:
				425	* 1) we haven't reserved any
				426	* 2) we are so fragmented, we've needed to add metadata too
				427	* many times. */
				428	if (!free_extents && !meta_ac) {
				429	mlog(0, "we haven't reserved any metadata!\n");
				430	status = -EAGAIN;
				431	reason = RESTART_META;
				432	goto leave;
				433	} else if ((!free_extents)
				434	&& (ocfs2_alloc_context_bits_left(meta_ac)
				435	< ocfs2_extend_meta_needed(fe))) {
				436	mlog(0, "filesystem is really fragmented...\n");
				437	status = -EAGAIN;
				438	reason = RESTART_META;
				439	goto leave;
				440	}
				441
				442	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
				443	&bit_off, &num_bits);
				444	if (status < 0) {
				445	if (status != -ENOSPC)
				446	mlog_errno(status);
				447	goto leave;
				448	}
				449
				450	BUG_ON(num_bits > clusters_to_add);
				451
				452	/* reserve our write early -- insert_extent may update the inode */
				453	status = ocfs2_journal_access(handle, inode, fe_bh,
				454	OCFS2_JOURNAL_ACCESS_WRITE);
				455	if (status < 0) {
				456	mlog_errno(status);
				457	goto leave;
				458	}
				459
				460	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	461	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
				462	num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	463	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
				464	num_bits, meta_ac);
				465	if (status < 0) {
				466	mlog_errno(status);
				467	goto leave;
				468	}
				469
				470	le32_add_cpu(&fe->i_clusters, num_bits);
				471	spin_lock(&OCFS2_I(inode)->ip_lock);
				472	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
				473	spin_unlock(&OCFS2_I(inode)->ip_lock);
				474
				475	status = ocfs2_journal_dirty(handle, fe_bh);
				476	if (status < 0) {
				477	mlog_errno(status);
				478	goto leave;
				479	}
				480
				481	clusters_to_add -= num_bits;
				482
				483	if (clusters_to_add) {
				484	mlog(0, "need to alloc once more, clusters = %u, wanted = "
				485	"%u\n", fe->i_clusters, clusters_to_add);
				486	status = -EAGAIN;
				487	reason = RESTART_TRANS;
				488	}
				489
				490	leave:
				491	mlog_exit(status);
				492	if (reason_ret)
				493	*reason_ret = reason;
				494	return status;
				495	}
				496
				497	static int ocfs2_extend_allocation(struct inode *inode,
				498	u32 clusters_to_add)
				499	{
				500	int status = 0;
				501	int restart_func = 0;
				502	int drop_alloc_sem = 0;
				503	int credits, num_free_extents;
				504	u32 prev_clusters;
				505	struct buffer_head *bh = NULL;
				506	struct ocfs2_dinode *fe = NULL;
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	507	handle_t *handle = NULL;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	508	struct ocfs2_alloc_context *data_ac = NULL;
				509	struct ocfs2_alloc_context *meta_ac = NULL;
				510	enum ocfs2_alloc_restarted why;
				511	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				512
				513	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
				514
				515	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
				516	OCFS2_BH_CACHED, inode);
				517	if (status < 0) {
				518	mlog_errno(status);
				519	goto leave;
				520	}
				521
				522	fe = (struct ocfs2_dinode *) bh->b_data;
				523	if (!OCFS2_IS_VALID_DINODE(fe)) {
				524	OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
				525	status = -EIO;
				526	goto leave;
				527	}
				528
				529	restart_all:
				530	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
				531
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	532	mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	533	"clusters_to_add = %u\n",
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	534	(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	535	fe->i_clusters, clusters_to_add);
				536
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	537	num_free_extents = ocfs2_num_free_extents(osb,
				538	inode,
				539	fe);
				540	if (num_free_extents < 0) {
				541	status = num_free_extents;
				542	mlog_errno(status);
				543	goto leave;
				544	}
				545
				546	if (!num_free_extents) {
Mark Fasheh	da5cbf2	2006-10-06 18:34:35 -0700	[diff] [blame]	547	status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	548	if (status < 0) {
				549	if (status != -ENOSPC)
				550	mlog_errno(status);
				551	goto leave;
				552	}
				553	}
				554
Mark Fasheh	da5cbf2	2006-10-06 18:34:35 -0700	[diff] [blame]	555	status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	556	if (status < 0) {
				557	if (status != -ENOSPC)
				558	mlog_errno(status);
				559	goto leave;
				560	}
				561
				562	/* blocks peope in read/write from reading our allocation
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	563	* until we're done changing it. We depend on i_mutex to block
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	564	* other extend/truncate calls while we're here. Ordering wrt
				565	* start_trans is important here -- always do it before! */
				566	down_write(&OCFS2_I(inode)->ip_alloc_sem);
				567	drop_alloc_sem = 1;
				568
				569	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
Mark Fasheh	65eff9c	2006-10-09 17:26:22 -0700	[diff] [blame]	570	handle = ocfs2_start_trans(osb, credits);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	571	if (IS_ERR(handle)) {
				572	status = PTR_ERR(handle);
				573	handle = NULL;
				574	mlog_errno(status);
				575	goto leave;
				576	}
				577
				578	restarted_transaction:
				579	/* reserve a write to the file entry early on - that we if we
				580	* run out of credits in the allocation path, we can still
				581	* update i_size. */
				582	status = ocfs2_journal_access(handle, inode, bh,
				583	OCFS2_JOURNAL_ACCESS_WRITE);
				584	if (status < 0) {
				585	mlog_errno(status);
				586	goto leave;
				587	}
				588
				589	prev_clusters = OCFS2_I(inode)->ip_clusters;
				590
				591	status = ocfs2_do_extend_allocation(osb,
				592	inode,
				593	clusters_to_add,
				594	bh,
				595	handle,
				596	data_ac,
				597	meta_ac,
				598	&why);
				599	if ((status < 0) && (status != -EAGAIN)) {
				600	if (status != -ENOSPC)
				601	mlog_errno(status);
				602	goto leave;
				603	}
				604
				605	status = ocfs2_journal_dirty(handle, bh);
				606	if (status < 0) {
				607	mlog_errno(status);
				608	goto leave;
				609	}
				610
				611	spin_lock(&OCFS2_I(inode)->ip_lock);
				612	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
				613	spin_unlock(&OCFS2_I(inode)->ip_lock);
				614
				615	if (why != RESTART_NONE && clusters_to_add) {
				616	if (why == RESTART_META) {
				617	mlog(0, "restarting function.\n");
				618	restart_func = 1;
				619	} else {
				620	BUG_ON(why != RESTART_TRANS);
				621
				622	mlog(0, "restarting transaction.\n");
				623	/* TODO: This can be more intelligent. */
				624	credits = ocfs2_calc_extend_credits(osb->sb,
				625	fe,
				626	clusters_to_add);
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	627	status = ocfs2_extend_trans(handle, credits);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	628	if (status < 0) {
				629	/* handle still has to be committed at
				630	* this point. */
				631	status = -ENOMEM;
				632	mlog_errno(status);
				633	goto leave;
				634	}
				635	goto restarted_transaction;
				636	}
				637	}
				638
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	639	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
				640	fe->i_clusters, (unsigned long long)fe->i_size);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	641	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
				642	OCFS2_I(inode)->ip_clusters, i_size_read(inode));
				643
				644	leave:
				645	if (drop_alloc_sem) {
				646	up_write(&OCFS2_I(inode)->ip_alloc_sem);
				647	drop_alloc_sem = 0;
				648	}
				649	if (handle) {
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	650	ocfs2_commit_trans(osb, handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	651	handle = NULL;
				652	}
				653	if (data_ac) {
				654	ocfs2_free_alloc_context(data_ac);
				655	data_ac = NULL;
				656	}
				657	if (meta_ac) {
				658	ocfs2_free_alloc_context(meta_ac);
				659	meta_ac = NULL;
				660	}
				661	if ((!status) && restart_func) {
				662	restart_func = 0;
				663	goto restart_all;
				664	}
				665	if (bh) {
				666	brelse(bh);
				667	bh = NULL;
				668	}
				669
				670	mlog_exit(status);
				671	return status;
				672	}
				673
				674	/* Some parts of this taken from generic_cont_expand, which turned out
				675	* to be too fragile to do exactly what we need without us having to
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	676	* worry about recursive locking in ->prepare_write() and
				677	* ->commit_write(). */
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	678	static int ocfs2_write_zero_page(struct inode *inode,
				679	u64 size)
				680	{
				681	struct address_space *mapping = inode->i_mapping;
				682	struct page *page;
				683	unsigned long index;
				684	unsigned int offset;
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	685	handle_t *handle = NULL;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	686	int ret;
				687
				688	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
				689	/* ugh. in prepare/commit_write, if from==to==start of block, we
				690	** skip the prepare. make sure we never send an offset for the start
				691	** of a block
				692	*/
				693	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				694	offset++;
				695	}
				696	index = size >> PAGE_CACHE_SHIFT;
				697
				698	page = grab_cache_page(mapping, index);
				699	if (!page) {
				700	ret = -ENOMEM;
				701	mlog_errno(ret);
				702	goto out;
				703	}
				704
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	705	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	706	if (ret < 0) {
				707	mlog_errno(ret);
				708	goto out_unlock;
				709	}
				710
				711	if (ocfs2_should_order_data(inode)) {
				712	handle = ocfs2_start_walk_page_trans(inode, page, offset,
				713	offset);
				714	if (IS_ERR(handle)) {
				715	ret = PTR_ERR(handle);
				716	handle = NULL;
				717	goto out_unlock;
				718	}
				719	}
				720
				721	/* must not update i_size! */
				722	ret = block_commit_write(page, offset, offset);
				723	if (ret < 0)
				724	mlog_errno(ret);
				725	else
				726	ret = 0;
				727
				728	if (handle)
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	729	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	730	out_unlock:
				731	unlock_page(page);
				732	page_cache_release(page);
				733	out:
				734	return ret;
				735	}
				736
				737	static int ocfs2_zero_extend(struct inode *inode,
				738	u64 zero_to_size)
				739	{
				740	int ret = 0;
				741	u64 start_off;
				742	struct super_block *sb = inode->i_sb;
				743
				744	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
				745	while (start_off < zero_to_size) {
				746	ret = ocfs2_write_zero_page(inode, start_off);
				747	if (ret < 0) {
				748	mlog_errno(ret);
				749	goto out;
				750	}
				751
				752	start_off += sb->s_blocksize;
Mark Fasheh	e2057c5	2006-10-03 17:53:05 -0700	[diff] [blame]	753
				754	/*
				755	* Very large extends have the potential to lock up
				756	* the cpu for extended periods of time.
				757	*/
				758	cond_resched();
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	759	}
				760
				761	out:
				762	return ret;
				763	}
				764
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	765	/*
				766	* A tail_to_skip value > 0 indicates that we're being called from
				767	* ocfs2_file_aio_write(). This has the following implications:
				768	*
				769	* - we don't want to update i_size
				770	* - di_bh will be NULL, which is fine because it's only used in the
				771	* case where we want to update i_size.
				772	* - ocfs2_zero_extend() will then only be filling the hole created
				773	* between i_size and the start of the write.
				774	*/
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	775	static int ocfs2_extend_file(struct inode *inode,
				776	struct buffer_head *di_bh,
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	777	u64 new_i_size,
				778	size_t tail_to_skip)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	779	{
				780	int ret = 0;
				781	u32 clusters_to_add;
				782
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	783	BUG_ON(!tail_to_skip && !di_bh);
				784
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	785	/* setattr sometimes calls us like this. */
				786	if (new_i_size == 0)
				787	goto out;
				788
				789	if (i_size_read(inode) == new_i_size)
				790	goto out;
				791	BUG_ON(new_i_size < i_size_read(inode));
				792
				793	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
				794	OCFS2_I(inode)->ip_clusters;
				795
Mark Fasheh	0effef7	2006-10-03 17:44:42 -0700	[diff] [blame]	796	/*
				797	* protect the pages that ocfs2_zero_extend is going to be
				798	* pulling into the page cache.. we do this before the
				799	* metadata extend so that we don't get into the situation
				800	* where we've extended the metadata but can't get the data
				801	* lock to zero.
				802	*/
				803	ret = ocfs2_data_lock(inode, 1);
				804	if (ret < 0) {
				805	mlog_errno(ret);
				806	goto out;
				807	}
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	808
Mark Fasheh	0effef7	2006-10-03 17:44:42 -0700	[diff] [blame]	809	if (clusters_to_add) {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	810	ret = ocfs2_extend_allocation(inode, clusters_to_add);
				811	if (ret < 0) {
				812	mlog_errno(ret);
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	813	goto out_unlock;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	814	}
Mark Fasheh	0effef7	2006-10-03 17:44:42 -0700	[diff] [blame]	815	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	816
Mark Fasheh	0effef7	2006-10-03 17:44:42 -0700	[diff] [blame]	817	/*
				818	* Call this even if we don't add any clusters to the tree. We
				819	* still need to zero the area between the old i_size and the
				820	* new i_size.
				821	*/
				822	ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
				823	if (ret < 0) {
				824	mlog_errno(ret);
				825	goto out_unlock;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	826	}
				827
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	828	if (!tail_to_skip) {
				829	/* We're being called from ocfs2_setattr() which wants
				830	* us to update i_size */
				831	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
				832	if (ret < 0)
				833	mlog_errno(ret);
				834	}
				835
				836	out_unlock:
Mark Fasheh	0effef7	2006-10-03 17:44:42 -0700	[diff] [blame]	837	ocfs2_data_unlock(inode, 1);
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	838
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	839	out:
				840	return ret;
				841	}
				842
				843	int ocfs2_setattr(struct dentry dentry, struct iattr attr)
				844	{
				845	int status = 0, size_change;
				846	struct inode *inode = dentry->d_inode;
				847	struct super_block *sb = inode->i_sb;
				848	struct ocfs2_super *osb = OCFS2_SB(sb);
				849	struct buffer_head *bh = NULL;
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	850	handle_t *handle = NULL;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	851
				852	mlog_entry("(0x%p, '%.*s')\n", dentry,
				853	dentry->d_name.len, dentry->d_name.name);
				854
				855	if (attr->ia_valid & ATTR_MODE)
				856	mlog(0, "mode change: %d\n", attr->ia_mode);
				857	if (attr->ia_valid & ATTR_UID)
				858	mlog(0, "uid change: %d\n", attr->ia_uid);
				859	if (attr->ia_valid & ATTR_GID)
				860	mlog(0, "gid change: %d\n", attr->ia_gid);
				861	if (attr->ia_valid & ATTR_SIZE)
				862	mlog(0, "size change...\n");
				863	if (attr->ia_valid & (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME))
				864	mlog(0, "time change...\n");
				865
				866	#define OCFS2_VALID_ATTRS (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME \| ATTR_SIZE \
				867	\| ATTR_GID \| ATTR_UID \| ATTR_MODE)
				868	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
				869	mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
				870	return 0;
				871	}
				872
				873	status = inode_change_ok(inode, attr);
				874	if (status)
				875	return status;
				876
				877	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
				878	if (size_change) {
				879	status = ocfs2_rw_lock(inode, 1);
				880	if (status < 0) {
				881	mlog_errno(status);
				882	goto bail;
				883	}
				884	}
				885
Mark Fasheh	4bcec18	2006-10-09 16:02:40 -0700	[diff] [blame]	886	status = ocfs2_meta_lock(inode, &bh, 1);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	887	if (status < 0) {
				888	if (status != -ENOENT)
				889	mlog_errno(status);
				890	goto bail_unlock_rw;
				891	}
				892
				893	if (size_change && attr->ia_size != i_size_read(inode)) {
				894	if (i_size_read(inode) > attr->ia_size)
				895	status = ocfs2_truncate_file(inode, bh, attr->ia_size);
				896	else
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	897	status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	898	if (status < 0) {
				899	if (status != -ENOSPC)
				900	mlog_errno(status);
				901	status = -ENOSPC;
				902	goto bail_unlock;
				903	}
				904	}
				905
Mark Fasheh	65eff9c	2006-10-09 17:26:22 -0700	[diff] [blame]	906	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	907	if (IS_ERR(handle)) {
				908	status = PTR_ERR(handle);
				909	mlog_errno(status);
				910	goto bail_unlock;
				911	}
				912
				913	status = inode_setattr(inode, attr);
				914	if (status < 0) {
				915	mlog_errno(status);
				916	goto bail_commit;
				917	}
				918
				919	status = ocfs2_mark_inode_dirty(handle, inode, bh);
				920	if (status < 0)
				921	mlog_errno(status);
				922
				923	bail_commit:
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	924	ocfs2_commit_trans(osb, handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	925	bail_unlock:
				926	ocfs2_meta_unlock(inode, 1);
				927	bail_unlock_rw:
				928	if (size_change)
				929	ocfs2_rw_unlock(inode, 1);
				930	bail:
				931	if (bh)
				932	brelse(bh);
				933
				934	mlog_exit(status);
				935	return status;
				936	}
				937
				938	int ocfs2_getattr(struct vfsmount *mnt,
				939	struct dentry *dentry,
				940	struct kstat *stat)
				941	{
				942	struct inode *inode = dentry->d_inode;
				943	struct super_block *sb = dentry->d_inode->i_sb;
				944	struct ocfs2_super *osb = sb->s_fs_info;
				945	int err;
				946
				947	mlog_entry_void();
				948
				949	err = ocfs2_inode_revalidate(dentry);
				950	if (err) {
				951	if (err != -ENOENT)
				952	mlog_errno(err);
				953	goto bail;
				954	}
				955
				956	generic_fillattr(inode, stat);
				957
				958	/* We set the blksize from the cluster size for performance */
				959	stat->blksize = osb->s_clustersize;
				960
				961	bail:
				962	mlog_exit(err);
				963
				964	return err;
				965	}
				966
Tiger Yang	d38eb8d	2006-11-27 09:59:21 +0800	[diff] [blame]	967	int ocfs2_permission(struct inode inode, int mask, struct nameidata nd)
				968	{
				969	int ret;
				970
				971	mlog_entry_void();
				972
				973	ret = ocfs2_meta_lock(inode, NULL, 0);
				974	if (ret) {
				975	mlog_errno(ret);
				976	goto out;
				977	}
				978
				979	ret = generic_permission(inode, mask, NULL);
Tiger Yang	d38eb8d	2006-11-27 09:59:21 +0800	[diff] [blame]	980
				981	ocfs2_meta_unlock(inode, 0);
				982	out:
				983	mlog_exit(ret);
				984	return ret;
				985	}
				986
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	987	static int ocfs2_write_remove_suid(struct inode *inode)
				988	{
				989	int ret;
				990	struct buffer_head *bh = NULL;
				991	struct ocfs2_inode_info *oi = OCFS2_I(inode);
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	992	handle_t *handle;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	993	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				994	struct ocfs2_dinode *di;
				995
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	996	mlog_entry("(Inode %llu, mode 0%o)\n",
				997	(unsigned long long)oi->ip_blkno, inode->i_mode);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	998
Mark Fasheh	65eff9c	2006-10-09 17:26:22 -0700	[diff] [blame]	999	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1000	if (handle == NULL) {
				1001	ret = -ENOMEM;
				1002	mlog_errno(ret);
				1003	goto out;
				1004	}
				1005
				1006	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
				1007	if (ret < 0) {
				1008	mlog_errno(ret);
				1009	goto out_trans;
				1010	}
				1011
				1012	ret = ocfs2_journal_access(handle, inode, bh,
				1013	OCFS2_JOURNAL_ACCESS_WRITE);
				1014	if (ret < 0) {
				1015	mlog_errno(ret);
				1016	goto out_bh;
				1017	}
				1018
				1019	inode->i_mode &= ~S_ISUID;
				1020	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
				1021	inode->i_mode &= ~S_ISGID;
				1022
				1023	di = (struct ocfs2_dinode *) bh->b_data;
				1024	di->i_mode = cpu_to_le16(inode->i_mode);
				1025
				1026	ret = ocfs2_journal_dirty(handle, bh);
				1027	if (ret < 0)
				1028	mlog_errno(ret);
				1029	out_bh:
				1030	brelse(bh);
				1031	out_trans:
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	1032	ocfs2_commit_trans(osb, handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1033	out:
				1034	mlog_exit(ret);
				1035	return ret;
				1036	}
				1037
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1038	static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
				1039	loff_t *ppos,
				1040	size_t count,
				1041	int appending)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1042	{
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1043	int ret = 0, meta_level = appending;
				1044	struct inode *inode = dentry->d_inode;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1045	u32 clusters;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1046	loff_t newsize, saved_pos;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1047
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1048	/*
				1049	* We sample i_size under a read level meta lock to see if our write
				1050	* is extending the file, if it is we back off and get a write level
				1051	* meta lock.
				1052	*/
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1053	for(;;) {
Mark Fasheh	4bcec18	2006-10-09 16:02:40 -0700	[diff] [blame]	1054	ret = ocfs2_meta_lock(inode, NULL, meta_level);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1055	if (ret < 0) {
				1056	meta_level = -1;
				1057	mlog_errno(ret);
				1058	goto out;
				1059	}
				1060
				1061	/* Clear suid / sgid if necessary. We do this here
				1062	* instead of later in the write path because
				1063	* remove_suid() calls ->setattr without any hint that
				1064	* we may have already done our cluster locking. Since
				1065	* ocfs2_setattr() must take cluster locks to
				1066	* proceeed, this will lead us to recursively lock the
				1067	* inode. There's also the dinode i_size state which
				1068	* can be lost via setattr during extending writes (we
				1069	* set inode->i_size at the end of a write. */
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1070	if (should_remove_suid(dentry)) {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1071	if (meta_level == 0) {
				1072	ocfs2_meta_unlock(inode, meta_level);
				1073	meta_level = 1;
				1074	continue;
				1075	}
				1076
				1077	ret = ocfs2_write_remove_suid(inode);
				1078	if (ret < 0) {
				1079	mlog_errno(ret);
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1080	goto out_unlock;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1081	}
				1082	}
				1083
				1084	/* work on a copy of ppos until we're sure that we won't have
				1085	* to recalculate it due to relocking. */
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1086	if (appending) {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1087	saved_pos = i_size_read(inode);
				1088	mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
				1089	} else {
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1090	saved_pos = *ppos;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1091	}
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1092	newsize = count + saved_pos;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1093
Mark Fasheh	215c7f9	2006-02-01 16:42:10 -0800	[diff] [blame]	1094	mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
				1095	(long long) saved_pos, (long long) newsize,
				1096	(long long) i_size_read(inode));
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1097
				1098	/* No need for a higher level metadata lock if we're
				1099	* never going past i_size. */
				1100	if (newsize <= i_size_read(inode))
				1101	break;
				1102
				1103	if (meta_level == 0) {
				1104	ocfs2_meta_unlock(inode, meta_level);
				1105	meta_level = 1;
				1106	continue;
				1107	}
				1108
				1109	spin_lock(&OCFS2_I(inode)->ip_lock);
				1110	clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
				1111	OCFS2_I(inode)->ip_clusters;
				1112	spin_unlock(&OCFS2_I(inode)->ip_lock);
				1113
				1114	mlog(0, "Writing at EOF, may need more allocation: "
Mark Fasheh	215c7f9	2006-02-01 16:42:10 -0800	[diff] [blame]	1115	"i_size = %lld, newsize = %lld, need %u clusters\n",
				1116	(long long) i_size_read(inode), (long long) newsize,
				1117	clusters);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1118
				1119	/* We only want to continue the rest of this loop if
				1120	* our extend will actually require more
				1121	* allocation. */
				1122	if (!clusters)
				1123	break;
				1124
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1125	ret = ocfs2_extend_file(inode, NULL, newsize, count);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1126	if (ret < 0) {
				1127	if (ret != -ENOSPC)
				1128	mlog_errno(ret);
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1129	goto out_unlock;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1130	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1131	break;
				1132	}
				1133
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1134	if (appending)
				1135	*ppos = saved_pos;
				1136
				1137	out_unlock:
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1138	ocfs2_meta_unlock(inode, meta_level);
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1139
				1140	out:
				1141	return ret;
				1142	}
				1143
				1144	static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
				1145	const struct iovec *iov,
				1146	unsigned long nr_segs,
				1147	loff_t pos)
				1148	{
				1149	int ret, rw_level, have_alloc_sem = 0;
				1150	struct file *filp = iocb->ki_filp;
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1151	struct inode *inode = filp->f_path.dentry->d_inode;
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1152	int appending = filp->f_flags & O_APPEND ? 1 : 0;
				1153
				1154	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
				1155	(unsigned int)nr_segs,
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1156	filp->f_path.dentry->d_name.len,
				1157	filp->f_path.dentry->d_name.name);
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1158
				1159	/* happy write of zero bytes */
				1160	if (iocb->ki_left == 0)
				1161	return 0;
				1162
				1163	mutex_lock(&inode->i_mutex);
				1164	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
				1165	if (filp->f_flags & O_DIRECT) {
				1166	have_alloc_sem = 1;
				1167	down_read(&inode->i_alloc_sem);
				1168	}
				1169
				1170	/* concurrent O_DIRECT writes are allowed */
				1171	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
				1172	ret = ocfs2_rw_lock(inode, rw_level);
				1173	if (ret < 0) {
				1174	rw_level = -1;
				1175	mlog_errno(ret);
				1176	goto out;
				1177	}
				1178
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1179	ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1180	iocb->ki_left, appending);
				1181	if (ret < 0) {
				1182	mlog_errno(ret);
				1183	goto out;
				1184	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1185
				1186	/* communicate with ocfs2_dio_end_io */
				1187	ocfs2_iocb_set_rw_locked(iocb);
				1188
Badari Pulavarty	027445c	2006-09-30 23:28:46 -0700	[diff] [blame]	1189	ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1190
				1191	/* buffered aio wouldn't have proper lock coverage today */
				1192	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
				1193
				1194	/*
				1195	* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
				1196	* function pointer which is called when o_direct io completes so that
				1197	* it can unlock our rw lock. (it's the clustered equivalent of
				1198	* i_alloc_sem; protects truncate from racing with pending ios).
				1199	* Unfortunately there are error cases which call end_io and others
				1200	* that don't. so we don't have to unlock the rw_lock if either an
				1201	* async dio is going to do it in the future or an end_io after an
				1202	* error has already done it.
				1203	*/
				1204	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
				1205	rw_level = -1;
				1206	have_alloc_sem = 0;
				1207	}
				1208
				1209	out:
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1210	if (have_alloc_sem)
				1211	up_read(&inode->i_alloc_sem);
				1212	if (rw_level != -1)
				1213	ocfs2_rw_unlock(inode, rw_level);
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	1214	mutex_unlock(&inode->i_mutex);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1215
				1216	mlog_exit(ret);
				1217	return ret;
				1218	}
				1219
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1220	static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
				1221	struct file *out,
				1222	loff_t *ppos,
				1223	size_t len,
				1224	unsigned int flags)
				1225	{
				1226	int ret;
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1227	struct inode *inode = out->f_path.dentry->d_inode;
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1228
				1229	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
				1230	(unsigned int)len,
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1231	out->f_path.dentry->d_name.len,
				1232	out->f_path.dentry->d_name.name);
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1233
				1234	inode_double_lock(inode, pipe->inode);
				1235
				1236	ret = ocfs2_rw_lock(inode, 1);
				1237	if (ret < 0) {
				1238	mlog_errno(ret);
				1239	goto out;
				1240	}
				1241
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1242	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1243	if (ret < 0) {
				1244	mlog_errno(ret);
				1245	goto out_unlock;
				1246	}
				1247
				1248	/* ok, we're done with i_size and alloc work */
				1249	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
				1250
				1251	out_unlock:
				1252	ocfs2_rw_unlock(inode, 1);
				1253	out:
				1254	inode_double_unlock(inode, pipe->inode);
				1255
				1256	mlog_exit(ret);
				1257	return ret;
				1258	}
				1259
				1260	static ssize_t ocfs2_file_splice_read(struct file *in,
				1261	loff_t *ppos,
				1262	struct pipe_inode_info *pipe,
				1263	size_t len,
				1264	unsigned int flags)
				1265	{
				1266	int ret = 0;
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1267	struct inode *inode = in->f_path.dentry->d_inode;
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1268
				1269	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
				1270	(unsigned int)len,
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1271	in->f_path.dentry->d_name.len,
				1272	in->f_path.dentry->d_name.name);
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1273
				1274	/*
				1275	* See the comment in ocfs2_file_aio_read()
				1276	*/
				1277	ret = ocfs2_meta_lock(inode, NULL, 0);
				1278	if (ret < 0) {
				1279	mlog_errno(ret);
				1280	goto bail;
				1281	}
				1282	ocfs2_meta_unlock(inode, 0);
				1283
				1284	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
				1285
				1286	bail:
				1287	mlog_exit(ret);
				1288	return ret;
				1289	}
				1290
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1291	static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
Badari Pulavarty	027445c	2006-09-30 23:28:46 -0700	[diff] [blame]	1292	const struct iovec *iov,
				1293	unsigned long nr_segs,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1294	loff_t pos)
				1295	{
Tiger Yang	25899de	2006-11-15 15:49:02 +0800	[diff] [blame]	1296	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1297	struct file *filp = iocb->ki_filp;
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1298	struct inode *inode = filp->f_path.dentry->d_inode;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1299
Badari Pulavarty	027445c	2006-09-30 23:28:46 -0700	[diff] [blame]	1300	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
				1301	(unsigned int)nr_segs,
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	1302	filp->f_path.dentry->d_name.len,
				1303	filp->f_path.dentry->d_name.name);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1304
				1305	if (!inode) {
				1306	ret = -EINVAL;
				1307	mlog_errno(ret);
				1308	goto bail;
				1309	}
				1310
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1311	/*
				1312	* buffered reads protect themselves in ->readpage(). O_DIRECT reads
				1313	* need locks to protect pending reads from racing with truncate.
				1314	*/
				1315	if (filp->f_flags & O_DIRECT) {
				1316	down_read(&inode->i_alloc_sem);
				1317	have_alloc_sem = 1;
				1318
				1319	ret = ocfs2_rw_lock(inode, 0);
				1320	if (ret < 0) {
				1321	mlog_errno(ret);
				1322	goto bail;
				1323	}
				1324	rw_level = 0;
				1325	/* communicate with ocfs2_dio_end_io */
				1326	ocfs2_iocb_set_rw_locked(iocb);
				1327	}
				1328
Mark Fasheh	c4374f8	2006-05-05 19:04:35 -0700	[diff] [blame]	1329	/*
				1330	* We're fine letting folks race truncates and extending
				1331	* writes with read across the cluster, just like they can
				1332	* locally. Hence no rw_lock during read.
				1333	*
				1334	* Take and drop the meta data lock to update inode fields
				1335	* like i_size. This allows the checks down below
				1336	* generic_file_aio_read() a chance of actually working.
				1337	*/
Tiger Yang	25899de	2006-11-15 15:49:02 +0800	[diff] [blame]	1338	ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
Mark Fasheh	c4374f8	2006-05-05 19:04:35 -0700	[diff] [blame]	1339	if (ret < 0) {
				1340	mlog_errno(ret);
				1341	goto bail;
				1342	}
Tiger Yang	25899de	2006-11-15 15:49:02 +0800	[diff] [blame]	1343	ocfs2_meta_unlock(inode, lock_level);
Mark Fasheh	c4374f8	2006-05-05 19:04:35 -0700	[diff] [blame]	1344
Badari Pulavarty	027445c	2006-09-30 23:28:46 -0700	[diff] [blame]	1345	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1346	if (ret == -EINVAL)
				1347	mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
				1348
				1349	/* buffered aio wouldn't have proper lock coverage today */
				1350	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
				1351
				1352	/* see ocfs2_file_aio_write */
				1353	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
				1354	rw_level = -1;
				1355	have_alloc_sem = 0;
				1356	}
				1357
				1358	bail:
				1359	if (have_alloc_sem)
				1360	up_read(&inode->i_alloc_sem);
				1361	if (rw_level != -1)
				1362	ocfs2_rw_unlock(inode, rw_level);
				1363	mlog_exit(ret);
				1364
				1365	return ret;
				1366	}
				1367
Arjan van de Ven	92e1d5b	2007-02-12 00:55:39 -0800	[diff] [blame]	1368	const struct inode_operations ocfs2_file_iops = {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1369	.setattr = ocfs2_setattr,
				1370	.getattr = ocfs2_getattr,
Tiger Yang	d38eb8d	2006-11-27 09:59:21 +0800	[diff] [blame]	1371	.permission = ocfs2_permission,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1372	};
				1373
Arjan van de Ven	92e1d5b	2007-02-12 00:55:39 -0800	[diff] [blame]	1374	const struct inode_operations ocfs2_special_file_iops = {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1375	.setattr = ocfs2_setattr,
				1376	.getattr = ocfs2_getattr,
Tiger Yang	d38eb8d	2006-11-27 09:59:21 +0800	[diff] [blame]	1377	.permission = ocfs2_permission,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1378	};
				1379
Arjan van de Ven	4b6f5d2	2006-03-28 01:56:42 -0800	[diff] [blame]	1380	const struct file_operations ocfs2_fops = {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1381	.read = do_sync_read,
				1382	.write = do_sync_write,
				1383	.sendfile = generic_file_sendfile,
				1384	.mmap = ocfs2_mmap,
				1385	.fsync = ocfs2_sync_file,
				1386	.release = ocfs2_file_release,
				1387	.open = ocfs2_file_open,
				1388	.aio_read = ocfs2_file_aio_read,
				1389	.aio_write = ocfs2_file_aio_write,
Herbert Poetzl	ca4d147	2006-07-03 17:27:12 -0700	[diff] [blame]	1390	.ioctl = ocfs2_ioctl,
Tiger Yang	8659ac2	2006-10-17 18:29:52 -0700	[diff] [blame]	1391	.splice_read = ocfs2_file_splice_read,
				1392	.splice_write = ocfs2_file_splice_write,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1393	};
				1394
Arjan van de Ven	4b6f5d2	2006-03-28 01:56:42 -0800	[diff] [blame]	1395	const struct file_operations ocfs2_dops = {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1396	.read = generic_read_dir,
				1397	.readdir = ocfs2_readdir,
				1398	.fsync = ocfs2_sync_file,
Herbert Poetzl	ca4d147	2006-07-03 17:27:12 -0700	[diff] [blame]	1399	.ioctl = ocfs2_ioctl,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1400	};