Blame - fs/ocfs2/file.c - kernel/msm

blob: 4b4cbadd583834adddc8f033c64cc2a449fc3a7c [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -- mode: c; c-basic-offset: 8; --
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* file.c
				5	*
				6	* File open, close, extend, truncate
				7	*
				8	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				9	*
				10	* This program is free software; you can redistribute it and/or
				11	* modify it under the terms of the GNU General Public
				12	* License as published by the Free Software Foundation; either
				13	* version 2 of the License, or (at your option) any later version.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 02111-1307, USA.
				24	*/
				25
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	26	#include <linux/capability.h>
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	27	#include <linux/fs.h>
				28	#include <linux/types.h>
				29	#include <linux/slab.h>
				30	#include <linux/highmem.h>
				31	#include <linux/pagemap.h>
				32	#include <linux/uio.h>
				33
				34	#define MLOG_MASK_PREFIX ML_INODE
				35	#include <cluster/masklog.h>
				36
				37	#include "ocfs2.h"
				38
				39	#include "alloc.h"
				40	#include "aops.h"
				41	#include "dir.h"
				42	#include "dlmglue.h"
				43	#include "extent_map.h"
				44	#include "file.h"
				45	#include "sysfile.h"
				46	#include "inode.h"
				47	#include "journal.h"
				48	#include "mmap.h"
				49	#include "suballoc.h"
				50	#include "super.h"
				51
				52	#include "buffer_head_io.h"
				53
				54	static int ocfs2_sync_inode(struct inode *inode)
				55	{
				56	filemap_fdatawrite(inode->i_mapping);
				57	return sync_mapping_buffers(inode->i_mapping);
				58	}
				59
				60	static int ocfs2_file_open(struct inode inode, struct file file)
				61	{
				62	int status;
				63	int mode = file->f_flags;
				64	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				65
				66	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
				67	file->f_dentry->d_name.len, file->f_dentry->d_name.name);
				68
				69	spin_lock(&oi->ip_lock);
				70
				71	/* Check that the inode hasn't been wiped from disk by another
				72	* node. If it hasn't then we're safe as long as we hold the
				73	* spin lock until our increment of open count. */
				74	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
				75	spin_unlock(&oi->ip_lock);
				76
				77	status = -ENOENT;
				78	goto leave;
				79	}
				80
				81	if (mode & O_DIRECT)
				82	oi->ip_flags \|= OCFS2_INODE_OPEN_DIRECT;
				83
				84	oi->ip_open_count++;
				85	spin_unlock(&oi->ip_lock);
				86	status = 0;
				87	leave:
				88	mlog_exit(status);
				89	return status;
				90	}
				91
				92	static int ocfs2_file_release(struct inode inode, struct file file)
				93	{
				94	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				95
				96	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
				97	file->f_dentry->d_name.len,
				98	file->f_dentry->d_name.name);
				99
				100	spin_lock(&oi->ip_lock);
				101	if (!--oi->ip_open_count)
				102	oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
				103	spin_unlock(&oi->ip_lock);
				104
				105	mlog_exit(0);
				106
				107	return 0;
				108	}
				109
				110	static int ocfs2_sync_file(struct file *file,
				111	struct dentry *dentry,
				112	int datasync)
				113	{
				114	int err = 0;
				115	journal_t *journal;
				116	struct inode *inode = dentry->d_inode;
				117	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				118
				119	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
				120	dentry->d_name.len, dentry->d_name.name);
				121
				122	err = ocfs2_sync_inode(dentry->d_inode);
				123	if (err)
				124	goto bail;
				125
				126	journal = osb->journal->j_journal;
				127	err = journal_force_commit(journal);
				128
				129	bail:
				130	mlog_exit(err);
				131
				132	return (err < 0) ? -EIO : 0;
				133	}
				134
				135	int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
				136	struct inode *inode,
				137	struct buffer_head *fe_bh,
				138	u64 new_i_size)
				139	{
				140	int status;
				141
				142	mlog_entry_void();
				143	i_size_write(inode, new_i_size);
				144	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
				145	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
				146
				147	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
				148	if (status < 0) {
				149	mlog_errno(status);
				150	goto bail;
				151	}
				152
				153	bail:
				154	mlog_exit(status);
				155	return status;
				156	}
				157
				158	static int ocfs2_simple_size_update(struct inode *inode,
				159	struct buffer_head *di_bh,
				160	u64 new_i_size)
				161	{
				162	int ret;
				163	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				164	struct ocfs2_journal_handle *handle = NULL;
				165
				166	handle = ocfs2_start_trans(osb, NULL,
				167	OCFS2_INODE_UPDATE_CREDITS);
				168	if (handle == NULL) {
				169	ret = -ENOMEM;
				170	mlog_errno(ret);
				171	goto out;
				172	}
				173
				174	ret = ocfs2_set_inode_size(handle, inode, di_bh,
				175	new_i_size);
				176	if (ret < 0)
				177	mlog_errno(ret);
				178
				179	ocfs2_commit_trans(handle);
				180	out:
				181	return ret;
				182	}
				183
				184	static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				185	struct inode *inode,
				186	struct buffer_head *fe_bh,
				187	u64 new_i_size)
				188	{
				189	int status;
				190	struct ocfs2_journal_handle *handle;
				191
				192	mlog_entry_void();
				193
				194	/* TODO: This needs to actually orphan the inode in this
				195	* transaction. */
				196
				197	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
				198	if (IS_ERR(handle)) {
				199	status = PTR_ERR(handle);
				200	mlog_errno(status);
				201	goto out;
				202	}
				203
				204	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
				205	if (status < 0)
				206	mlog_errno(status);
				207
				208	ocfs2_commit_trans(handle);
				209	out:
				210	mlog_exit(status);
				211	return status;
				212	}
				213
				214	static int ocfs2_truncate_file(struct inode *inode,
				215	struct buffer_head *di_bh,
				216	u64 new_i_size)
				217	{
				218	int status = 0;
				219	struct ocfs2_dinode *fe = NULL;
				220	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				221	struct ocfs2_truncate_context *tc = NULL;
				222
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	223	mlog_entry("(inode = %llu, new_i_size = %llu\n",
				224	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				225	(unsigned long long)new_i_size);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	226
				227	truncate_inode_pages(inode->i_mapping, new_i_size);
				228
				229	fe = (struct ocfs2_dinode *) di_bh->b_data;
				230	if (!OCFS2_IS_VALID_DINODE(fe)) {
				231	OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
				232	status = -EIO;
				233	goto bail;
				234	}
				235
				236	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	237	"Inode %llu, inode i_size = %lld != di "
				238	"i_size = %llu, i_flags = 0x%x\n",
				239	(unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	240	i_size_read(inode),
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	241	(unsigned long long)le64_to_cpu(fe->i_size),
				242	le32_to_cpu(fe->i_flags));
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	243
				244	if (new_i_size > le64_to_cpu(fe->i_size)) {
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	245	mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
				246	(unsigned long long)le64_to_cpu(fe->i_size),
				247	(unsigned long long)new_i_size);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	248	status = -EINVAL;
				249	mlog_errno(status);
				250	goto bail;
				251	}
				252
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	253	mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
				254	(unsigned long long)le64_to_cpu(fe->i_blkno),
				255	(unsigned long long)le64_to_cpu(fe->i_size),
				256	(unsigned long long)new_i_size);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	257
				258	/* lets handle the simple truncate cases before doing any more
				259	* cluster locking. */
				260	if (new_i_size == le64_to_cpu(fe->i_size))
				261	goto bail;
				262
				263	if (le32_to_cpu(fe->i_clusters) ==
				264	ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
				265	mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
				266	fe->i_clusters);
				267	/* No allocation change is required, so lets fast path
				268	* this truncate. */
				269	status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
				270	if (status < 0)
				271	mlog_errno(status);
				272	goto bail;
				273	}
				274
				275	/* This forces other nodes to sync and drop their pages */
				276	status = ocfs2_data_lock(inode, 1);
				277	if (status < 0) {
				278	mlog_errno(status);
				279	goto bail;
				280	}
				281	ocfs2_data_unlock(inode, 1);
				282
				283	/* alright, we're going to need to do a full blown alloc size
				284	* change. Orphan the inode so that recovery can complete the
				285	* truncate if necessary. This does the task of marking
				286	* i_size. */
				287	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
				288	if (status < 0) {
				289	mlog_errno(status);
				290	goto bail;
				291	}
				292
				293	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
				294	if (status < 0) {
				295	mlog_errno(status);
				296	goto bail;
				297	}
				298
				299	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
				300	if (status < 0) {
				301	mlog_errno(status);
				302	goto bail;
				303	}
				304
				305	/* TODO: orphan dir cleanup here. */
				306	bail:
				307
				308	mlog_exit(status);
				309	return status;
				310	}
				311
				312	/*
				313	* extend allocation only here.
				314	* we'll update all the disk stuff, and oip->alloc_size
				315	*
				316	* expect stuff to be locked, a transaction started and enough data /
				317	* metadata reservations in the contexts.
				318	*
				319	* Will return -EAGAIN, and a reason if a restart is needed.
				320	* If passed in, *reason will always be set, even in error.
				321	*/
				322	int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
				323	struct inode *inode,
				324	u32 clusters_to_add,
				325	struct buffer_head *fe_bh,
				326	struct ocfs2_journal_handle *handle,
				327	struct ocfs2_alloc_context *data_ac,
				328	struct ocfs2_alloc_context *meta_ac,
				329	enum ocfs2_alloc_restarted *reason_ret)
				330	{
				331	int status = 0;
				332	int free_extents;
				333	struct ocfs2_dinode fe = (struct ocfs2_dinode ) fe_bh->b_data;
				334	enum ocfs2_alloc_restarted reason = RESTART_NONE;
				335	u32 bit_off, num_bits;
				336	u64 block;
				337
				338	BUG_ON(!clusters_to_add);
				339
				340	free_extents = ocfs2_num_free_extents(osb, inode, fe);
				341	if (free_extents < 0) {
				342	status = free_extents;
				343	mlog_errno(status);
				344	goto leave;
				345	}
				346
				347	/* there are two cases which could cause us to EAGAIN in the
				348	* we-need-more-metadata case:
				349	* 1) we haven't reserved any
				350	* 2) we are so fragmented, we've needed to add metadata too
				351	* many times. */
				352	if (!free_extents && !meta_ac) {
				353	mlog(0, "we haven't reserved any metadata!\n");
				354	status = -EAGAIN;
				355	reason = RESTART_META;
				356	goto leave;
				357	} else if ((!free_extents)
				358	&& (ocfs2_alloc_context_bits_left(meta_ac)
				359	< ocfs2_extend_meta_needed(fe))) {
				360	mlog(0, "filesystem is really fragmented...\n");
				361	status = -EAGAIN;
				362	reason = RESTART_META;
				363	goto leave;
				364	}
				365
				366	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
				367	&bit_off, &num_bits);
				368	if (status < 0) {
				369	if (status != -ENOSPC)
				370	mlog_errno(status);
				371	goto leave;
				372	}
				373
				374	BUG_ON(num_bits > clusters_to_add);
				375
				376	/* reserve our write early -- insert_extent may update the inode */
				377	status = ocfs2_journal_access(handle, inode, fe_bh,
				378	OCFS2_JOURNAL_ACCESS_WRITE);
				379	if (status < 0) {
				380	mlog_errno(status);
				381	goto leave;
				382	}
				383
				384	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	385	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
				386	num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	387	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
				388	num_bits, meta_ac);
				389	if (status < 0) {
				390	mlog_errno(status);
				391	goto leave;
				392	}
				393
				394	le32_add_cpu(&fe->i_clusters, num_bits);
				395	spin_lock(&OCFS2_I(inode)->ip_lock);
				396	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
				397	spin_unlock(&OCFS2_I(inode)->ip_lock);
				398
				399	status = ocfs2_journal_dirty(handle, fe_bh);
				400	if (status < 0) {
				401	mlog_errno(status);
				402	goto leave;
				403	}
				404
				405	clusters_to_add -= num_bits;
				406
				407	if (clusters_to_add) {
				408	mlog(0, "need to alloc once more, clusters = %u, wanted = "
				409	"%u\n", fe->i_clusters, clusters_to_add);
				410	status = -EAGAIN;
				411	reason = RESTART_TRANS;
				412	}
				413
				414	leave:
				415	mlog_exit(status);
				416	if (reason_ret)
				417	*reason_ret = reason;
				418	return status;
				419	}
				420
				421	static int ocfs2_extend_allocation(struct inode *inode,
				422	u32 clusters_to_add)
				423	{
				424	int status = 0;
				425	int restart_func = 0;
				426	int drop_alloc_sem = 0;
				427	int credits, num_free_extents;
				428	u32 prev_clusters;
				429	struct buffer_head *bh = NULL;
				430	struct ocfs2_dinode *fe = NULL;
				431	struct ocfs2_journal_handle *handle = NULL;
				432	struct ocfs2_alloc_context *data_ac = NULL;
				433	struct ocfs2_alloc_context *meta_ac = NULL;
				434	enum ocfs2_alloc_restarted why;
				435	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				436
				437	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
				438
				439	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
				440	OCFS2_BH_CACHED, inode);
				441	if (status < 0) {
				442	mlog_errno(status);
				443	goto leave;
				444	}
				445
				446	fe = (struct ocfs2_dinode *) bh->b_data;
				447	if (!OCFS2_IS_VALID_DINODE(fe)) {
				448	OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
				449	status = -EIO;
				450	goto leave;
				451	}
				452
				453	restart_all:
				454	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
				455
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	456	mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	457	"clusters_to_add = %u\n",
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	458	(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	459	fe->i_clusters, clusters_to_add);
				460
				461	handle = ocfs2_alloc_handle(osb);
				462	if (handle == NULL) {
				463	status = -ENOMEM;
				464	mlog_errno(status);
				465	goto leave;
				466	}
				467
				468	num_free_extents = ocfs2_num_free_extents(osb,
				469	inode,
				470	fe);
				471	if (num_free_extents < 0) {
				472	status = num_free_extents;
				473	mlog_errno(status);
				474	goto leave;
				475	}
				476
				477	if (!num_free_extents) {
				478	status = ocfs2_reserve_new_metadata(osb,
				479	handle,
				480	fe,
				481	&meta_ac);
				482	if (status < 0) {
				483	if (status != -ENOSPC)
				484	mlog_errno(status);
				485	goto leave;
				486	}
				487	}
				488
				489	status = ocfs2_reserve_clusters(osb,
				490	handle,
				491	clusters_to_add,
				492	&data_ac);
				493	if (status < 0) {
				494	if (status != -ENOSPC)
				495	mlog_errno(status);
				496	goto leave;
				497	}
				498
				499	/* blocks peope in read/write from reading our allocation
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	500	* until we're done changing it. We depend on i_mutex to block
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	501	* other extend/truncate calls while we're here. Ordering wrt
				502	* start_trans is important here -- always do it before! */
				503	down_write(&OCFS2_I(inode)->ip_alloc_sem);
				504	drop_alloc_sem = 1;
				505
				506	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
				507	handle = ocfs2_start_trans(osb, handle, credits);
				508	if (IS_ERR(handle)) {
				509	status = PTR_ERR(handle);
				510	handle = NULL;
				511	mlog_errno(status);
				512	goto leave;
				513	}
				514
				515	restarted_transaction:
				516	/* reserve a write to the file entry early on - that we if we
				517	* run out of credits in the allocation path, we can still
				518	* update i_size. */
				519	status = ocfs2_journal_access(handle, inode, bh,
				520	OCFS2_JOURNAL_ACCESS_WRITE);
				521	if (status < 0) {
				522	mlog_errno(status);
				523	goto leave;
				524	}
				525
				526	prev_clusters = OCFS2_I(inode)->ip_clusters;
				527
				528	status = ocfs2_do_extend_allocation(osb,
				529	inode,
				530	clusters_to_add,
				531	bh,
				532	handle,
				533	data_ac,
				534	meta_ac,
				535	&why);
				536	if ((status < 0) && (status != -EAGAIN)) {
				537	if (status != -ENOSPC)
				538	mlog_errno(status);
				539	goto leave;
				540	}
				541
				542	status = ocfs2_journal_dirty(handle, bh);
				543	if (status < 0) {
				544	mlog_errno(status);
				545	goto leave;
				546	}
				547
				548	spin_lock(&OCFS2_I(inode)->ip_lock);
				549	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
				550	spin_unlock(&OCFS2_I(inode)->ip_lock);
				551
				552	if (why != RESTART_NONE && clusters_to_add) {
				553	if (why == RESTART_META) {
				554	mlog(0, "restarting function.\n");
				555	restart_func = 1;
				556	} else {
				557	BUG_ON(why != RESTART_TRANS);
				558
				559	mlog(0, "restarting transaction.\n");
				560	/* TODO: This can be more intelligent. */
				561	credits = ocfs2_calc_extend_credits(osb->sb,
				562	fe,
				563	clusters_to_add);
				564	status = ocfs2_extend_trans(handle, credits);
				565	if (status < 0) {
				566	/* handle still has to be committed at
				567	* this point. */
				568	status = -ENOMEM;
				569	mlog_errno(status);
				570	goto leave;
				571	}
				572	goto restarted_transaction;
				573	}
				574	}
				575
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	576	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
				577	fe->i_clusters, (unsigned long long)fe->i_size);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	578	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
				579	OCFS2_I(inode)->ip_clusters, i_size_read(inode));
				580
				581	leave:
				582	if (drop_alloc_sem) {
				583	up_write(&OCFS2_I(inode)->ip_alloc_sem);
				584	drop_alloc_sem = 0;
				585	}
				586	if (handle) {
				587	ocfs2_commit_trans(handle);
				588	handle = NULL;
				589	}
				590	if (data_ac) {
				591	ocfs2_free_alloc_context(data_ac);
				592	data_ac = NULL;
				593	}
				594	if (meta_ac) {
				595	ocfs2_free_alloc_context(meta_ac);
				596	meta_ac = NULL;
				597	}
				598	if ((!status) && restart_func) {
				599	restart_func = 0;
				600	goto restart_all;
				601	}
				602	if (bh) {
				603	brelse(bh);
				604	bh = NULL;
				605	}
				606
				607	mlog_exit(status);
				608	return status;
				609	}
				610
				611	/* Some parts of this taken from generic_cont_expand, which turned out
				612	* to be too fragile to do exactly what we need without us having to
				613	* worry about recursive locking in ->commit_write(). */
				614	static int ocfs2_write_zero_page(struct inode *inode,
				615	u64 size)
				616	{
				617	struct address_space *mapping = inode->i_mapping;
				618	struct page *page;
				619	unsigned long index;
				620	unsigned int offset;
				621	struct ocfs2_journal_handle *handle = NULL;
				622	int ret;
				623
				624	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
				625	/* ugh. in prepare/commit_write, if from==to==start of block, we
				626	** skip the prepare. make sure we never send an offset for the start
				627	** of a block
				628	*/
				629	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				630	offset++;
				631	}
				632	index = size >> PAGE_CACHE_SHIFT;
				633
				634	page = grab_cache_page(mapping, index);
				635	if (!page) {
				636	ret = -ENOMEM;
				637	mlog_errno(ret);
				638	goto out;
				639	}
				640
				641	ret = ocfs2_prepare_write(NULL, page, offset, offset);
				642	if (ret < 0) {
				643	mlog_errno(ret);
				644	goto out_unlock;
				645	}
				646
				647	if (ocfs2_should_order_data(inode)) {
				648	handle = ocfs2_start_walk_page_trans(inode, page, offset,
				649	offset);
				650	if (IS_ERR(handle)) {
				651	ret = PTR_ERR(handle);
				652	handle = NULL;
				653	goto out_unlock;
				654	}
				655	}
				656
				657	/* must not update i_size! */
				658	ret = block_commit_write(page, offset, offset);
				659	if (ret < 0)
				660	mlog_errno(ret);
				661	else
				662	ret = 0;
				663
				664	if (handle)
				665	ocfs2_commit_trans(handle);
				666	out_unlock:
				667	unlock_page(page);
				668	page_cache_release(page);
				669	out:
				670	return ret;
				671	}
				672
				673	static int ocfs2_zero_extend(struct inode *inode,
				674	u64 zero_to_size)
				675	{
				676	int ret = 0;
				677	u64 start_off;
				678	struct super_block *sb = inode->i_sb;
				679
				680	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
				681	while (start_off < zero_to_size) {
				682	ret = ocfs2_write_zero_page(inode, start_off);
				683	if (ret < 0) {
				684	mlog_errno(ret);
				685	goto out;
				686	}
				687
				688	start_off += sb->s_blocksize;
				689	}
				690
				691	out:
				692	return ret;
				693	}
				694
				695	static int ocfs2_extend_file(struct inode *inode,
				696	struct buffer_head *di_bh,
				697	u64 new_i_size)
				698	{
				699	int ret = 0;
				700	u32 clusters_to_add;
				701
				702	/* setattr sometimes calls us like this. */
				703	if (new_i_size == 0)
				704	goto out;
				705
				706	if (i_size_read(inode) == new_i_size)
				707	goto out;
				708	BUG_ON(new_i_size < i_size_read(inode));
				709
				710	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
				711	OCFS2_I(inode)->ip_clusters;
				712
				713	if (clusters_to_add) {
				714	ret = ocfs2_extend_allocation(inode, clusters_to_add);
				715	if (ret < 0) {
				716	mlog_errno(ret);
				717	goto out;
				718	}
				719
				720	ret = ocfs2_zero_extend(inode, new_i_size);
				721	if (ret < 0) {
				722	mlog_errno(ret);
				723	goto out;
				724	}
				725	}
				726
				727	/* No allocation required, we just use this helper to
				728	* do a trivial update of i_size. */
				729	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
				730	if (ret < 0) {
				731	mlog_errno(ret);
				732	goto out;
				733	}
				734
				735	out:
				736	return ret;
				737	}
				738
				739	int ocfs2_setattr(struct dentry dentry, struct iattr attr)
				740	{
				741	int status = 0, size_change;
				742	struct inode *inode = dentry->d_inode;
				743	struct super_block *sb = inode->i_sb;
				744	struct ocfs2_super *osb = OCFS2_SB(sb);
				745	struct buffer_head *bh = NULL;
				746	struct ocfs2_journal_handle *handle = NULL;
				747
				748	mlog_entry("(0x%p, '%.*s')\n", dentry,
				749	dentry->d_name.len, dentry->d_name.name);
				750
				751	if (attr->ia_valid & ATTR_MODE)
				752	mlog(0, "mode change: %d\n", attr->ia_mode);
				753	if (attr->ia_valid & ATTR_UID)
				754	mlog(0, "uid change: %d\n", attr->ia_uid);
				755	if (attr->ia_valid & ATTR_GID)
				756	mlog(0, "gid change: %d\n", attr->ia_gid);
				757	if (attr->ia_valid & ATTR_SIZE)
				758	mlog(0, "size change...\n");
				759	if (attr->ia_valid & (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME))
				760	mlog(0, "time change...\n");
				761
				762	#define OCFS2_VALID_ATTRS (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME \| ATTR_SIZE \
				763	\| ATTR_GID \| ATTR_UID \| ATTR_MODE)
				764	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
				765	mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
				766	return 0;
				767	}
				768
				769	status = inode_change_ok(inode, attr);
				770	if (status)
				771	return status;
				772
				773	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
				774	if (size_change) {
				775	status = ocfs2_rw_lock(inode, 1);
				776	if (status < 0) {
				777	mlog_errno(status);
				778	goto bail;
				779	}
				780	}
				781
				782	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
				783	if (status < 0) {
				784	if (status != -ENOENT)
				785	mlog_errno(status);
				786	goto bail_unlock_rw;
				787	}
				788
				789	if (size_change && attr->ia_size != i_size_read(inode)) {
				790	if (i_size_read(inode) > attr->ia_size)
				791	status = ocfs2_truncate_file(inode, bh, attr->ia_size);
				792	else
				793	status = ocfs2_extend_file(inode, bh, attr->ia_size);
				794	if (status < 0) {
				795	if (status != -ENOSPC)
				796	mlog_errno(status);
				797	status = -ENOSPC;
				798	goto bail_unlock;
				799	}
				800	}
				801
				802	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
				803	if (IS_ERR(handle)) {
				804	status = PTR_ERR(handle);
				805	mlog_errno(status);
				806	goto bail_unlock;
				807	}
				808
				809	status = inode_setattr(inode, attr);
				810	if (status < 0) {
				811	mlog_errno(status);
				812	goto bail_commit;
				813	}
				814
				815	status = ocfs2_mark_inode_dirty(handle, inode, bh);
				816	if (status < 0)
				817	mlog_errno(status);
				818
				819	bail_commit:
				820	ocfs2_commit_trans(handle);
				821	bail_unlock:
				822	ocfs2_meta_unlock(inode, 1);
				823	bail_unlock_rw:
				824	if (size_change)
				825	ocfs2_rw_unlock(inode, 1);
				826	bail:
				827	if (bh)
				828	brelse(bh);
				829
				830	mlog_exit(status);
				831	return status;
				832	}
				833
				834	int ocfs2_getattr(struct vfsmount *mnt,
				835	struct dentry *dentry,
				836	struct kstat *stat)
				837	{
				838	struct inode *inode = dentry->d_inode;
				839	struct super_block *sb = dentry->d_inode->i_sb;
				840	struct ocfs2_super *osb = sb->s_fs_info;
				841	int err;
				842
				843	mlog_entry_void();
				844
				845	err = ocfs2_inode_revalidate(dentry);
				846	if (err) {
				847	if (err != -ENOENT)
				848	mlog_errno(err);
				849	goto bail;
				850	}
				851
				852	generic_fillattr(inode, stat);
				853
				854	/* We set the blksize from the cluster size for performance */
				855	stat->blksize = osb->s_clustersize;
				856
				857	bail:
				858	mlog_exit(err);
				859
				860	return err;
				861	}
				862
				863	static int ocfs2_write_remove_suid(struct inode *inode)
				864	{
				865	int ret;
				866	struct buffer_head *bh = NULL;
				867	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				868	struct ocfs2_journal_handle *handle;
				869	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				870	struct ocfs2_dinode *di;
				871
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	872	mlog_entry("(Inode %llu, mode 0%o)\n",
				873	(unsigned long long)oi->ip_blkno, inode->i_mode);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	874
				875	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
				876	if (handle == NULL) {
				877	ret = -ENOMEM;
				878	mlog_errno(ret);
				879	goto out;
				880	}
				881
				882	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
				883	if (ret < 0) {
				884	mlog_errno(ret);
				885	goto out_trans;
				886	}
				887
				888	ret = ocfs2_journal_access(handle, inode, bh,
				889	OCFS2_JOURNAL_ACCESS_WRITE);
				890	if (ret < 0) {
				891	mlog_errno(ret);
				892	goto out_bh;
				893	}
				894
				895	inode->i_mode &= ~S_ISUID;
				896	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
				897	inode->i_mode &= ~S_ISGID;
				898
				899	di = (struct ocfs2_dinode *) bh->b_data;
				900	di->i_mode = cpu_to_le16(inode->i_mode);
				901
				902	ret = ocfs2_journal_dirty(handle, bh);
				903	if (ret < 0)
				904	mlog_errno(ret);
				905	out_bh:
				906	brelse(bh);
				907	out_trans:
				908	ocfs2_commit_trans(handle);
				909	out:
				910	mlog_exit(ret);
				911	return ret;
				912	}
				913
				914	static inline int ocfs2_write_should_remove_suid(struct inode *inode)
				915	{
				916	mode_t mode = inode->i_mode;
				917
				918	if (!capable(CAP_FSETID)) {
				919	if (unlikely(mode & S_ISUID))
				920	return 1;
				921
				922	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
				923	return 1;
				924	}
				925	return 0;
				926	}
				927
				928	static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
				929	const char __user *buf,
				930	size_t count,
				931	loff_t pos)
				932	{
				933	struct iovec local_iov = { .iov_base = (void __user *)buf,
				934	.iov_len = count };
				935	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
				936	u32 clusters;
				937	struct file *filp = iocb->ki_filp;
				938	struct inode *inode = filp->f_dentry->d_inode;
				939	loff_t newsize, saved_pos;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	940
				941	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
				942	(unsigned int)count,
				943	filp->f_dentry->d_name.len,
				944	filp->f_dentry->d_name.name);
				945
				946	/* happy write of zero bytes */
				947	if (count == 0)
				948	return 0;
				949
				950	if (!inode) {
				951	mlog(0, "bad inode\n");
				952	return -EIO;
				953	}
				954
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	955	mutex_lock(&inode->i_mutex);
				956	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	957	if (filp->f_flags & O_DIRECT) {
				958	have_alloc_sem = 1;
				959	down_read(&inode->i_alloc_sem);
				960	}
				961
				962	/* concurrent O_DIRECT writes are allowed */
				963	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
				964	ret = ocfs2_rw_lock(inode, rw_level);
				965	if (ret < 0) {
				966	rw_level = -1;
				967	mlog_errno(ret);
				968	goto out;
				969	}
				970
				971	/*
				972	* We sample i_size under a read level meta lock to see if our write
				973	* is extending the file, if it is we back off and get a write level
				974	* meta lock.
				975	*/
				976	meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
				977	for(;;) {
				978	ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
				979	if (ret < 0) {
				980	meta_level = -1;
				981	mlog_errno(ret);
				982	goto out;
				983	}
				984
				985	/* Clear suid / sgid if necessary. We do this here
				986	* instead of later in the write path because
				987	* remove_suid() calls ->setattr without any hint that
				988	* we may have already done our cluster locking. Since
				989	* ocfs2_setattr() must take cluster locks to
				990	* proceeed, this will lead us to recursively lock the
				991	* inode. There's also the dinode i_size state which
				992	* can be lost via setattr during extending writes (we
				993	* set inode->i_size at the end of a write. */
				994	if (ocfs2_write_should_remove_suid(inode)) {
				995	if (meta_level == 0) {
				996	ocfs2_meta_unlock(inode, meta_level);
				997	meta_level = 1;
				998	continue;
				999	}
				1000
				1001	ret = ocfs2_write_remove_suid(inode);
				1002	if (ret < 0) {
				1003	mlog_errno(ret);
				1004	goto out;
				1005	}
				1006	}
				1007
				1008	/* work on a copy of ppos until we're sure that we won't have
				1009	* to recalculate it due to relocking. */
				1010	if (filp->f_flags & O_APPEND) {
				1011	saved_pos = i_size_read(inode);
				1012	mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
				1013	} else {
				1014	saved_pos = iocb->ki_pos;
				1015	}
				1016	newsize = count + saved_pos;
				1017
Mark Fasheh	215c7f9	2006-02-01 16:42:10 -0800	[diff] [blame]	1018	mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
				1019	(long long) saved_pos, (long long) newsize,
				1020	(long long) i_size_read(inode));
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1021
				1022	/* No need for a higher level metadata lock if we're
				1023	* never going past i_size. */
				1024	if (newsize <= i_size_read(inode))
				1025	break;
				1026
				1027	if (meta_level == 0) {
				1028	ocfs2_meta_unlock(inode, meta_level);
				1029	meta_level = 1;
				1030	continue;
				1031	}
				1032
				1033	spin_lock(&OCFS2_I(inode)->ip_lock);
				1034	clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
				1035	OCFS2_I(inode)->ip_clusters;
				1036	spin_unlock(&OCFS2_I(inode)->ip_lock);
				1037
				1038	mlog(0, "Writing at EOF, may need more allocation: "
Mark Fasheh	215c7f9	2006-02-01 16:42:10 -0800	[diff] [blame]	1039	"i_size = %lld, newsize = %lld, need %u clusters\n",
				1040	(long long) i_size_read(inode), (long long) newsize,
				1041	clusters);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1042
				1043	/* We only want to continue the rest of this loop if
				1044	* our extend will actually require more
				1045	* allocation. */
				1046	if (!clusters)
				1047	break;
				1048
				1049	ret = ocfs2_extend_allocation(inode, clusters);
				1050	if (ret < 0) {
				1051	if (ret != -ENOSPC)
				1052	mlog_errno(ret);
				1053	goto out;
				1054	}
				1055
				1056	/* Fill any holes which would've been created by this
				1057	* write. If we're O_APPEND, this will wind up
				1058	* (correctly) being a noop. */
				1059	ret = ocfs2_zero_extend(inode, (u64) newsize - count);
				1060	if (ret < 0) {
				1061	mlog_errno(ret);
				1062	goto out;
				1063	}
				1064	break;
				1065	}
				1066
				1067	/* ok, we're done with i_size and alloc work */
				1068	iocb->ki_pos = saved_pos;
				1069	ocfs2_meta_unlock(inode, meta_level);
				1070	meta_level = -1;
				1071
				1072	/* communicate with ocfs2_dio_end_io */
				1073	ocfs2_iocb_set_rw_locked(iocb);
				1074
Mark Fasheh	d267a56	2006-02-23 13:23:39 -0800	[diff] [blame]	1075	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1076
				1077	/* buffered aio wouldn't have proper lock coverage today */
				1078	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
				1079
				1080	/*
				1081	* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
				1082	* function pointer which is called when o_direct io completes so that
				1083	* it can unlock our rw lock. (it's the clustered equivalent of
				1084	* i_alloc_sem; protects truncate from racing with pending ios).
				1085	* Unfortunately there are error cases which call end_io and others
				1086	* that don't. so we don't have to unlock the rw_lock if either an
				1087	* async dio is going to do it in the future or an end_io after an
				1088	* error has already done it.
				1089	*/
				1090	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
				1091	rw_level = -1;
				1092	have_alloc_sem = 0;
				1093	}
				1094
				1095	out:
				1096	if (meta_level != -1)
				1097	ocfs2_meta_unlock(inode, meta_level);
				1098	if (have_alloc_sem)
				1099	up_read(&inode->i_alloc_sem);
				1100	if (rw_level != -1)
				1101	ocfs2_rw_unlock(inode, rw_level);
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	1102	mutex_unlock(&inode->i_mutex);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1103
				1104	mlog_exit(ret);
				1105	return ret;
				1106	}
				1107
				1108	static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
				1109	char __user *buf,
				1110	size_t count,
				1111	loff_t pos)
				1112	{
				1113	int ret = 0, rw_level = -1, have_alloc_sem = 0;
				1114	struct file *filp = iocb->ki_filp;
				1115	struct inode *inode = filp->f_dentry->d_inode;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1116
				1117	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
				1118	(unsigned int)count,
				1119	filp->f_dentry->d_name.len,
				1120	filp->f_dentry->d_name.name);
				1121
				1122	if (!inode) {
				1123	ret = -EINVAL;
				1124	mlog_errno(ret);
				1125	goto bail;
				1126	}
				1127
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1128	/*
				1129	* buffered reads protect themselves in ->readpage(). O_DIRECT reads
				1130	* need locks to protect pending reads from racing with truncate.
				1131	*/
				1132	if (filp->f_flags & O_DIRECT) {
				1133	down_read(&inode->i_alloc_sem);
				1134	have_alloc_sem = 1;
				1135
				1136	ret = ocfs2_rw_lock(inode, 0);
				1137	if (ret < 0) {
				1138	mlog_errno(ret);
				1139	goto bail;
				1140	}
				1141	rw_level = 0;
				1142	/* communicate with ocfs2_dio_end_io */
				1143	ocfs2_iocb_set_rw_locked(iocb);
				1144	}
				1145
				1146	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
				1147	if (ret == -EINVAL)
				1148	mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
				1149
				1150	/* buffered aio wouldn't have proper lock coverage today */
				1151	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
				1152
				1153	/* see ocfs2_file_aio_write */
				1154	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
				1155	rw_level = -1;
				1156	have_alloc_sem = 0;
				1157	}
				1158
				1159	bail:
				1160	if (have_alloc_sem)
				1161	up_read(&inode->i_alloc_sem);
				1162	if (rw_level != -1)
				1163	ocfs2_rw_unlock(inode, rw_level);
				1164	mlog_exit(ret);
				1165
				1166	return ret;
				1167	}
				1168
				1169	struct inode_operations ocfs2_file_iops = {
				1170	.setattr = ocfs2_setattr,
				1171	.getattr = ocfs2_getattr,
				1172	};
				1173
				1174	struct inode_operations ocfs2_special_file_iops = {
				1175	.setattr = ocfs2_setattr,
				1176	.getattr = ocfs2_getattr,
				1177	};
				1178
				1179	struct file_operations ocfs2_fops = {
				1180	.read = do_sync_read,
				1181	.write = do_sync_write,
				1182	.sendfile = generic_file_sendfile,
				1183	.mmap = ocfs2_mmap,
				1184	.fsync = ocfs2_sync_file,
				1185	.release = ocfs2_file_release,
				1186	.open = ocfs2_file_open,
				1187	.aio_read = ocfs2_file_aio_read,
				1188	.aio_write = ocfs2_file_aio_write,
				1189	};
				1190
				1191	struct file_operations ocfs2_dops = {
				1192	.read = generic_read_dir,
				1193	.readdir = ocfs2_readdir,
				1194	.fsync = ocfs2_sync_file,
				1195	};