Blame - fs/ocfs2/aops.c - kernel/msm

blob: c22543b3342062f0308ec06a6f74f2334291727f [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -*- mode: c; c-basic-offset: 8; -*-
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public
				8	* License as published by the Free Software Foundation; either
				9	* version 2 of the License, or (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it will be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	* General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public
				17	* License along with this program; if not, write to the
				18	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				19	* Boston, MA 02111-1307, USA.
				20	*/
				21
				22	#include <linux/fs.h>
				23	#include <linux/slab.h>
				24	#include <linux/highmem.h>
				25	#include <linux/pagemap.h>
				26	#include <asm/byteorder.h>
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	27	#include <linux/swap.h>
Mark Fasheh	6af67d8	2007-03-06 17:24:46 -0800	[diff] [blame]	28	#include <linux/pipe_fs_i.h>
Mark Fasheh	628a24f	2007-10-30 12:08:32 -0700	[diff] [blame]	29	#include <linux/mpage.h>
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	30
				31	#define MLOG_MASK_PREFIX ML_FILE_IO
				32	#include <cluster/masklog.h>
				33
				34	#include "ocfs2.h"
				35
				36	#include "alloc.h"
				37	#include "aops.h"
				38	#include "dlmglue.h"
				39	#include "extent_map.h"
				40	#include "file.h"
				41	#include "inode.h"
				42	#include "journal.h"
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	43	#include "suballoc.h"
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	44	#include "super.h"
				45	#include "symlink.h"
				46
				47	#include "buffer_head_io.h"
				48
				49	static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				50	struct buffer_head *bh_result, int create)
				51	{
				52	int err = -EIO;
				53	int status;
				54	struct ocfs2_dinode *fe = NULL;
				55	struct buffer_head *bh = NULL;
				56	struct buffer_head *buffer_cache_bh = NULL;
				57	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				58	void *kaddr;
				59
				60	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				61	(unsigned long long)iblock, bh_result, create);
				62
				63	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
				64
				65	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
				66	mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
				67	(unsigned long long)iblock);
				68	goto bail;
				69	}
				70
Joel Becker	0fcaa56	2008-10-09 17:20:31 -0700	[diff] [blame]	71	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	72	if (status < 0) {
				73	mlog_errno(status);
				74	goto bail;
				75	}
				76	fe = (struct ocfs2_dinode *) bh->b_data;
				77
				78	if (!OCFS2_IS_VALID_DINODE(fe)) {
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	79	mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
Mark Fasheh	1ca1a11	2007-04-27 16:01:25 -0700	[diff] [blame]	80	(unsigned long long)le64_to_cpu(fe->i_blkno), 7,
				81	fe->i_signature);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	82	goto bail;
				83	}
				84
				85	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
				86	le32_to_cpu(fe->i_clusters))) {
				87	mlog(ML_ERROR, "block offset is outside the allocated size: "
				88	"%llu\n", (unsigned long long)iblock);
				89	goto bail;
				90	}
				91
				92	/* We don't use the page cache to create symlink data, so if
				93	* need be, copy it over from the buffer cache. */
				94	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
				95	u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
				96	iblock;
				97	buffer_cache_bh = sb_getblk(osb->sb, blkno);
				98	if (!buffer_cache_bh) {
				99	mlog(ML_ERROR, "couldn't getblock for symlink!\n");
				100	goto bail;
				101	}
				102
				103	/* we haven't locked out transactions, so a commit
				104	* could've happened. Since we've got a reference on
				105	* the bh, even if it commits while we're doing the
				106	* copy, the data is still good. */
				107	if (buffer_jbd(buffer_cache_bh)
				108	&& ocfs2_inode_is_new(inode)) {
				109	kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
				110	if (!kaddr) {
				111	mlog(ML_ERROR, "couldn't kmap!\n");
				112	goto bail;
				113	}
				114	memcpy(kaddr + (bh_result->b_size * iblock),
				115	buffer_cache_bh->b_data,
				116	bh_result->b_size);
				117	kunmap_atomic(kaddr, KM_USER0);
				118	set_buffer_uptodate(bh_result);
				119	}
				120	brelse(buffer_cache_bh);
				121	}
				122
				123	map_bh(bh_result, inode->i_sb,
				124	le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
				125
				126	err = 0;
				127
				128	bail:
Mark Fasheh	a81cb88	2008-10-07 14:25:16 -0700	[diff] [blame]	129	brelse(bh);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	130
				131	mlog_exit(err);
				132	return err;
				133	}
				134
				135	static int ocfs2_get_block(struct inode *inode, sector_t iblock,
				136	struct buffer_head *bh_result, int create)
				137	{
				138	int err = 0;
Mark Fasheh	49cb8d2	2007-03-09 16:21:46 -0800	[diff] [blame]	139	unsigned int ext_flags;
Mark Fasheh	628a24f	2007-10-30 12:08:32 -0700	[diff] [blame]	140	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
				141	u64 p_blkno, count, past_eof;
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	142	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	143
				144	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				145	(unsigned long long)iblock, bh_result, create);
				146
				147	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
				148	mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
				149	inode, inode->i_ino);
				150
				151	if (S_ISLNK(inode->i_mode)) {
				152	/* this always does I/O for some reason. */
				153	err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
				154	goto bail;
				155	}
				156
Mark Fasheh	628a24f	2007-10-30 12:08:32 -0700	[diff] [blame]	157	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
Mark Fasheh	49cb8d2	2007-03-09 16:21:46 -0800	[diff] [blame]	158	&ext_flags);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	159	if (err) {
				160	mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	161	"%llu, NULL)\n", err, inode, (unsigned long long)iblock,
				162	(unsigned long long)p_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	163	goto bail;
				164	}
				165
Mark Fasheh	628a24f	2007-10-30 12:08:32 -0700	[diff] [blame]	166	if (max_blocks < count)
				167	count = max_blocks;
				168
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	169	/*
				170	* ocfs2 never allocates in this function - the only time we
				171	* need to use BH_New is when we're extending i_size on a file
				172	* system which doesn't support holes, in which case BH_New
				173	* allows block_prepare_write() to zero.
Coly Li	c0420ad	2008-06-30 18:45:45 +0800	[diff] [blame]	174	*
				175	* If we see this on a sparse file system, then a truncate has
				176	* raced us and removed the cluster. In this case, we clear
				177	* the buffers dirty and uptodate bits and let the buffer code
				178	* ignore it as a hole.
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	179	*/
Coly Li	c0420ad	2008-06-30 18:45:45 +0800	[diff] [blame]	180	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
				181	clear_buffer_dirty(bh_result);
				182	clear_buffer_uptodate(bh_result);
				183	goto bail;
				184	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	185
Mark Fasheh	49cb8d2	2007-03-09 16:21:46 -0800	[diff] [blame]	186	/* Treat the unwritten extent as a hole for zeroing purposes. */
				187	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	188	map_bh(bh_result, inode->i_sb, p_blkno);
				189
Mark Fasheh	628a24f	2007-10-30 12:08:32 -0700	[diff] [blame]	190	bh_result->b_size = count << inode->i_blkbits;
				191
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	192	if (!ocfs2_sparse_alloc(osb)) {
				193	if (p_blkno == 0) {
				194	err = -EIO;
				195	mlog(ML_ERROR,
				196	"iblock = %llu p_blkno = %llu blkno=(%llu)\n",
				197	(unsigned long long)iblock,
				198	(unsigned long long)p_blkno,
				199	(unsigned long long)OCFS2_I(inode)->ip_blkno);
				200	mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
				201	dump_stack();
				202	}
				203
				204	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
				205	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
				206	(unsigned long long)past_eof);
				207
				208	if (create && (iblock >= past_eof))
				209	set_buffer_new(bh_result);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	210	}
				211
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	212	bail:
				213	if (err < 0)
				214	err = -EIO;
				215
				216	mlog_exit(err);
				217	return err;
				218	}
				219
Mark Fasheh	1afc32b	2007-09-07 14:46:51 -0700	[diff] [blame]	220	int ocfs2_read_inline_data(struct inode inode, struct page page,
				221	struct buffer_head *di_bh)
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	222	{
				223	void *kaddr;
Jan Kara	d2849fb	2007-12-19 15:24:09 +0100	[diff] [blame]	224	loff_t size;
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	225	struct ocfs2_dinode di = (struct ocfs2_dinode )di_bh->b_data;
				226
				227	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
				228	ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
				229	(unsigned long long)OCFS2_I(inode)->ip_blkno);
				230	return -EROFS;
				231	}
				232
				233	size = i_size_read(inode);
				234
				235	if (size > PAGE_CACHE_SIZE \|\|
				236	size > ocfs2_max_inline_data(inode->i_sb)) {
				237	ocfs2_error(inode->i_sb,
Jan Kara	d2849fb	2007-12-19 15:24:09 +0100	[diff] [blame]	238	"Inode %llu has with inline data has bad size: %Lu",
				239	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				240	(unsigned long long)size);
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	241	return -EROFS;
				242	}
				243
				244	kaddr = kmap_atomic(page, KM_USER0);
				245	if (size)
				246	memcpy(kaddr, di->id2.i_data.id_data, size);
				247	/* Clear the remaining part of the page */
				248	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
				249	flush_dcache_page(page);
				250	kunmap_atomic(kaddr, KM_USER0);
				251
				252	SetPageUptodate(page);
				253
				254	return 0;
				255	}
				256
				257	static int ocfs2_readpage_inline(struct inode inode, struct page page)
				258	{
				259	int ret;
				260	struct buffer_head *di_bh = NULL;
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	261
				262	BUG_ON(!PageLocked(page));
Julia Lawall	86c838b	2008-02-26 21:45:56 +0100	[diff] [blame]	263	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	264
Joel Becker	0fcaa56	2008-10-09 17:20:31 -0700	[diff] [blame]	265	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	266	if (ret) {
				267	mlog_errno(ret);
				268	goto out;
				269	}
				270
				271	ret = ocfs2_read_inline_data(inode, page, di_bh);
				272	out:
				273	unlock_page(page);
				274
				275	brelse(di_bh);
				276	return ret;
				277	}
				278
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	279	static int ocfs2_readpage(struct file file, struct page page)
				280	{
				281	struct inode *inode = page->mapping->host;
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	282	struct ocfs2_inode_info *oi = OCFS2_I(inode);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	283	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
				284	int ret, unlock = 1;
				285
				286	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
				287
Mark Fasheh	e63aecb6	2007-10-18 15:30:42 -0700	[diff] [blame]	288	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	289	if (ret != 0) {
				290	if (ret == AOP_TRUNCATED_PAGE)
				291	unlock = 0;
				292	mlog_errno(ret);
				293	goto out;
				294	}
				295
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	296	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
Mark Fasheh	e9dfc0b	2007-05-14 11:38:51 -0700	[diff] [blame]	297	ret = AOP_TRUNCATED_PAGE;
Mark Fasheh	e63aecb6	2007-10-18 15:30:42 -0700	[diff] [blame]	298	goto out_inode_unlock;
Mark Fasheh	e9dfc0b	2007-05-14 11:38:51 -0700	[diff] [blame]	299	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	300
				301	/*
				302	* i_size might have just been updated as we grabed the meta lock. We
				303	* might now be discovering a truncate that hit on another node.
				304	* block_read_full_page->get_block freaks out if it is asked to read
				305	* beyond the end of a file, so we check here. Callers
Nick Piggin	54cb882	2007-07-19 01:46:59 -0700	[diff] [blame]	306	* (generic_file_read, vm_ops->fault) are clever enough to check i_size
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	307	* and notice that the page they just read isn't needed.
				308	*
				309	* XXX sys_readahead() seems to get that wrong?
				310	*/
				311	if (start >= i_size_read(inode)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	312	zero_user(page, 0, PAGE_SIZE);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	313	SetPageUptodate(page);
				314	ret = 0;
				315	goto out_alloc;
				316	}
				317
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	318	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
				319	ret = ocfs2_readpage_inline(inode, page);
				320	else
				321	ret = block_read_full_page(page, ocfs2_get_block);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	322	unlock = 0;
				323
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	324	out_alloc:
				325	up_read(&OCFS2_I(inode)->ip_alloc_sem);
Mark Fasheh	e63aecb6	2007-10-18 15:30:42 -0700	[diff] [blame]	326	out_inode_unlock:
				327	ocfs2_inode_unlock(inode, 0);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	328	out:
				329	if (unlock)
				330	unlock_page(page);
				331	mlog_exit(ret);
				332	return ret;
				333	}
				334
Mark Fasheh	628a24f	2007-10-30 12:08:32 -0700	[diff] [blame]	335	/*
				336	* This is used only for read-ahead. Failures or difficult to handle
				337	* situations are safe to ignore.
				338	*
				339	* Right now, we don't bother with BH_Boundary - in-inode extent lists
				340	* are quite large (243 extents on 4k blocks), so most inodes don't
				341	* grow out to a tree. If need be, detecting boundary extents could
				342	* trivially be added in a future version of ocfs2_get_block().
				343	*/
				344	static int ocfs2_readpages(struct file filp, struct address_space mapping,
				345	struct list_head *pages, unsigned nr_pages)
				346	{
				347	int ret, err = -EIO;
				348	struct inode *inode = mapping->host;
				349	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				350	loff_t start;
				351	struct page *last;
				352
				353	/*
				354	* Use the nonblocking flag for the dlm code to avoid page
				355	* lock inversion, but don't bother with retrying.
				356	*/
				357	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
				358	if (ret)
				359	return err;
				360
				361	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
				362	ocfs2_inode_unlock(inode, 0);
				363	return err;
				364	}
				365
				366	/*
				367	* Don't bother with inline-data. There isn't anything
				368	* to read-ahead in that case anyway...
				369	*/
				370	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
				371	goto out_unlock;
				372
				373	/*
				374	* Check whether a remote node truncated this file - we just
				375	* drop out in that case as it's not worth handling here.
				376	*/
				377	last = list_entry(pages->prev, struct page, lru);
				378	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
				379	if (start >= i_size_read(inode))
				380	goto out_unlock;
				381
				382	err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
				383
				384	out_unlock:
				385	up_read(&oi->ip_alloc_sem);
				386	ocfs2_inode_unlock(inode, 0);
				387
				388	return err;
				389	}
				390
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	391	/* Note: Because we don't support holes, our allocation has
				392	* already happened (allocation writes zeros to the file data)
				393	* so we don't have to worry about ordered writes in
				394	* ocfs2_writepage.
				395	*
				396	* ->writepage is called during the process of invalidating the page cache
				397	* during blocked lock processing. It can't block on any cluster locks
				398	* to during block mapping. It's relying on the fact that the block
				399	* mapping can't have disappeared under the dirty pages that it is
				400	* being asked to write back.
				401	*/
				402	static int ocfs2_writepage(struct page page, struct writeback_control wbc)
				403	{
				404	int ret;
				405
				406	mlog_entry("(0x%p)\n", page);
				407
				408	ret = block_write_full_page(page, ocfs2_get_block, wbc);
				409
				410	mlog_exit(ret);
				411
				412	return ret;
				413	}
				414
Mark Fasheh	5069120	2007-02-09 20:52:53 -0800	[diff] [blame]	415	/*
				416	* This is called from ocfs2_write_zero_page() which has handled it's
				417	* own cluster locking and has ensured allocation exists for those
				418	* blocks to be written.
				419	*/
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	420	int ocfs2_prepare_write_nolock(struct inode inode, struct page page,
				421	unsigned from, unsigned to)
				422	{
				423	int ret;
				424
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	425	ret = block_prepare_write(page, from, to, ocfs2_get_block);
				426
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	427	return ret;
				428	}
				429
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	430	/* Taken from ext3. We don't necessarily need the full blown
				431	* functionality yet, but IMHO it's better to cut and paste the whole
				432	* thing so we can avoid introducing our own bugs (and easily pick up
				433	* their fixes when they happen) --Mark */
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame]	434	int walk_page_buffers( handle_t *handle,
				435	struct buffer_head *head,
				436	unsigned from,
				437	unsigned to,
				438	int *partial,
				439	int (fn)( handle_t handle,
				440	struct buffer_head *bh))
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	441	{
				442	struct buffer_head *bh;
				443	unsigned block_start, block_end;
				444	unsigned blocksize = head->b_size;
				445	int err, ret = 0;
				446	struct buffer_head *next;
				447
				448	for ( bh = head, block_start = 0;
				449	ret == 0 && (bh != head \|\| !block_start);
				450	block_start = block_end, bh = next)
				451	{
				452	next = bh->b_this_page;
				453	block_end = block_start + blocksize;
				454	if (block_end <= from \|\| block_start >= to) {
				455	if (partial && !buffer_uptodate(bh))
				456	*partial = 1;
				457	continue;
				458	}
				459	err = (*fn)(handle, bh);
				460	if (!ret)
				461	ret = err;
				462	}
				463	return ret;
				464	}
				465
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	466	handle_t ocfs2_start_walk_page_trans(struct inode inode,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	467	struct page *page,
				468	unsigned from,
				469	unsigned to)
				470	{
				471	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Julia Lawall	58dadcd	2008-03-28 14:43:10 -0700	[diff] [blame]	472	handle_t *handle;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	473	int ret = 0;
				474
Mark Fasheh	65eff9c	2006-10-09 17:26:22 -0700	[diff] [blame]	475	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
Julia Lawall	58dadcd	2008-03-28 14:43:10 -0700	[diff] [blame]	476	if (IS_ERR(handle)) {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	477	ret = -ENOMEM;
				478	mlog_errno(ret);
				479	goto out;
				480	}
				481
				482	if (ocfs2_should_order_data(inode)) {
Joel Becker	2b4e30f	2008-09-03 20:03:41 -0700	[diff] [blame]	483	ret = ocfs2_jbd2_file_inode(handle, inode);
				484	#ifdef CONFIG_OCFS2_COMPAT_JBD
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	485	ret = walk_page_buffers(handle,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	486	page_buffers(page),
				487	from, to, NULL,
				488	ocfs2_journal_dirty_data);
Joel Becker	2b4e30f	2008-09-03 20:03:41 -0700	[diff] [blame]	489	#endif
				490	if (ret < 0)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	491	mlog_errno(ret);
				492	}
				493	out:
				494	if (ret) {
Julia Lawall	58dadcd	2008-03-28 14:43:10 -0700	[diff] [blame]	495	if (!IS_ERR(handle))
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	496	ocfs2_commit_trans(osb, handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	497	handle = ERR_PTR(ret);
				498	}
				499	return handle;
				500	}
				501
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	502	static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
				503	{
				504	sector_t status;
				505	u64 p_blkno = 0;
				506	int err = 0;
				507	struct inode *inode = mapping->host;
				508
				509	mlog_entry("(block = %llu)\n", (unsigned long long)block);
				510
				511	/* We don't need to lock journal system files, since they aren't
				512	* accessed concurrently from multiple nodes.
				513	*/
				514	if (!INODE_JOURNAL(inode)) {
Mark Fasheh	e63aecb6	2007-10-18 15:30:42 -0700	[diff] [blame]	515	err = ocfs2_inode_lock(inode, NULL, 0);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	516	if (err) {
				517	if (err != -ENOENT)
				518	mlog_errno(err);
				519	goto bail;
				520	}
				521	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				522	}
				523
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	524	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
				525	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
				526	NULL);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	527
				528	if (!INODE_JOURNAL(inode)) {
				529	up_read(&OCFS2_I(inode)->ip_alloc_sem);
Mark Fasheh	e63aecb6	2007-10-18 15:30:42 -0700	[diff] [blame]	530	ocfs2_inode_unlock(inode, 0);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	531	}
				532
				533	if (err) {
				534	mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
				535	(unsigned long long)block);
				536	mlog_errno(err);
				537	goto bail;
				538	}
				539
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	540	bail:
				541	status = err ? 0 : p_blkno;
				542
				543	mlog_exit((int)status);
				544
				545	return status;
				546	}
				547
				548	/*
				549	* TODO: Make this into a generic get_blocks function.
				550	*
				551	* From do_direct_io in direct-io.c:
				552	* "So what we do is to permit the ->get_blocks function to populate
				553	* bh.b_size with the size of IO which is permitted at this offset and
				554	* this i_blkbits."
				555	*
				556	* This function is called directly from get_more_blocks in direct-io.c.
				557	*
				558	* called like this: dio->get_blocks(dio->inode, fs_startblk,
				559	* fs_count, map_bh, dio->rw == WRITE);
				560	*/
				561	static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	562	struct buffer_head *bh_result, int create)
				563	{
				564	int ret;
Mark Fasheh	4f902c3	2007-03-09 16:26:50 -0800	[diff] [blame]	565	u64 p_blkno, inode_blocks, contig_blocks;
Mark Fasheh	49cb8d2	2007-03-09 16:21:46 -0800	[diff] [blame]	566	unsigned int ext_flags;
Florin Malita	184d7d2	2006-06-03 19:30:10 -0400	[diff] [blame]	567	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
Badari Pulavarty	1d8fa7a	2006-03-26 01:38:02 -0800	[diff] [blame]	568	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	569
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	570	/* This function won't even be called if the request isn't all
				571	* nicely aligned and of the right size, so there's no need
				572	* for us to check any of that. */
				573
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	574	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
Mark Fasheh	564f8a3	2006-12-14 13:01:05 -0800	[diff] [blame]	575
				576	/*
				577	* Any write past EOF is not allowed because we'd be extending.
				578	*/
				579	if (create && (iblock + max_blocks) > inode_blocks) {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	580	ret = -EIO;
				581	goto bail;
				582	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	583
				584	/* This figures out the size of the next contiguous block, and
				585	* our logical offset */
Mark Fasheh	363041a	2007-01-17 12:31:35 -0800	[diff] [blame]	586	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
Mark Fasheh	49cb8d2	2007-03-09 16:21:46 -0800	[diff] [blame]	587	&contig_blocks, &ext_flags);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	588	if (ret) {
				589	mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
				590	(unsigned long long)iblock);
				591	ret = -EIO;
				592	goto bail;
				593	}
				594
Tao Ma	0e11622	2008-09-03 01:57:14 +0800	[diff] [blame]	595	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	596	ocfs2_error(inode->i_sb,
				597	"Inode %llu has a hole at block %llu\n",
				598	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				599	(unsigned long long)iblock);
				600	ret = -EROFS;
				601	goto bail;
				602	}
				603
				604	/*
				605	* get_more_blocks() expects us to describe a hole by clearing
				606	* the mapped bit on bh_result().
Mark Fasheh	49cb8d2	2007-03-09 16:21:46 -0800	[diff] [blame]	607	*
				608	* Consider an unwritten extent as a hole.
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	609	*/
Mark Fasheh	49cb8d2	2007-03-09 16:21:46 -0800	[diff] [blame]	610	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	611	map_bh(bh_result, inode->i_sb, p_blkno);
				612	else {
				613	/*
				614	* ocfs2_prepare_inode_for_write() should have caught
				615	* the case where we'd be filling a hole and triggered
				616	* a buffered write instead.
				617	*/
				618	if (create) {
				619	ret = -EIO;
				620	mlog_errno(ret);
				621	goto bail;
				622	}
				623
				624	clear_buffer_mapped(bh_result);
				625	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	626
				627	/* make sure we don't map more than max_blocks blocks here as
				628	that's all the kernel will handle at this point. */
				629	if (max_blocks < contig_blocks)
				630	contig_blocks = max_blocks;
				631	bh_result->b_size = contig_blocks << blocksize_bits;
				632	bail:
				633	return ret;
				634	}
				635
				636	/*
				637	* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
				638	* particularly interested in the aio/dio case. Like the core uses
				639	* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
				640	* truncation on another.
				641	*/
				642	static void ocfs2_dio_end_io(struct kiocb *iocb,
				643	loff_t offset,
				644	ssize_t bytes,
				645	void *private)
				646	{
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	647	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
Mark Fasheh	7cdfc3a	2007-04-16 17:28:51 -0700	[diff] [blame]	648	int level;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	649
				650	/* this io's submitter should not have unlocked this before we could */
				651	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
Mark Fasheh	7cdfc3a	2007-04-16 17:28:51 -0700	[diff] [blame]	652
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	653	ocfs2_iocb_clear_rw_locked(iocb);
Mark Fasheh	7cdfc3a	2007-04-16 17:28:51 -0700	[diff] [blame]	654
				655	level = ocfs2_iocb_rw_locked_level(iocb);
				656	if (!level)
				657	up_read(&inode->i_alloc_sem);
				658	ocfs2_rw_unlock(inode, level);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	659	}
				660
Joel Becker	03f981c	2007-01-04 14:54:41 -0800	[diff] [blame]	661	/*
				662	* ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
				663	* from ext3. PageChecked() bits have been removed as OCFS2 does not
				664	* do journalled data.
				665	*/
				666	static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
				667	{
				668	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
				669
Joel Becker	2b4e30f	2008-09-03 20:03:41 -0700	[diff] [blame]	670	jbd2_journal_invalidatepage(journal, page, offset);
Joel Becker	03f981c	2007-01-04 14:54:41 -0800	[diff] [blame]	671	}
				672
				673	static int ocfs2_releasepage(struct page *page, gfp_t wait)
				674	{
				675	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
				676
				677	if (!page_has_buffers(page))
				678	return 0;
Joel Becker	2b4e30f	2008-09-03 20:03:41 -0700	[diff] [blame]	679	return jbd2_journal_try_to_free_buffers(journal, page, wait);
Joel Becker	03f981c	2007-01-04 14:54:41 -0800	[diff] [blame]	680	}
				681
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	682	static ssize_t ocfs2_direct_IO(int rw,
				683	struct kiocb *iocb,
				684	const struct iovec *iov,
				685	loff_t offset,
				686	unsigned long nr_segs)
				687	{
				688	struct file *file = iocb->ki_filp;
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	689	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	690	int ret;
				691
				692	mlog_entry_void();
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	693
Mark Fasheh	6798d35	2007-09-07 14:05:51 -0700	[diff] [blame]	694	/*
				695	* Fallback to buffered I/O if we see an inode without
				696	* extents.
				697	*/
				698	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
				699	return 0;
				700
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	701	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
				702	inode->i_sb->s_bdev, iov, offset,
				703	nr_segs,
				704	ocfs2_direct_IO_get_blocks,
				705	ocfs2_dio_end_io);
Mark Fasheh	c934a92	2007-10-18 15:23:46 -0700	[diff] [blame]	706
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	707	mlog_exit(ret);
				708	return ret;
				709	}
				710
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	711	static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
				712	u32 cpos,
				713	unsigned int *start,
				714	unsigned int *end)
				715	{
				716	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
				717
				718	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
				719	unsigned int cpp;
				720
				721	cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
				722
				723	cluster_start = cpos % cpp;
				724	cluster_start = cluster_start << osb->s_clustersize_bits;
				725
				726	cluster_end = cluster_start + osb->s_clustersize;
				727	}
				728
				729	BUG_ON(cluster_start > PAGE_SIZE);
				730	BUG_ON(cluster_end > PAGE_SIZE);
				731
				732	if (start)
				733	*start = cluster_start;
				734	if (end)
				735	*end = cluster_end;
				736	}
				737
				738	/*
				739	* 'from' and 'to' are the region in the page to avoid zeroing.
				740	*
				741	* If pagesize > clustersize, this function will avoid zeroing outside
				742	* of the cluster boundary.
				743	*
				744	* from == to == 0 is code for "zero the entire cluster region"
				745	*/
				746	static void ocfs2_clear_page_regions(struct page *page,
				747	struct ocfs2_super *osb, u32 cpos,
				748	unsigned from, unsigned to)
				749	{
				750	void *kaddr;
				751	unsigned int cluster_start, cluster_end;
				752
				753	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
				754
				755	kaddr = kmap_atomic(page, KM_USER0);
				756
				757	if (from \|\| to) {
				758	if (from > cluster_start)
				759	memset(kaddr + cluster_start, 0, from - cluster_start);
				760	if (to < cluster_end)
				761	memset(kaddr + to, 0, cluster_end - to);
				762	} else {
				763	memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
				764	}
				765
				766	kunmap_atomic(kaddr, KM_USER0);
				767	}
				768
				769	/*
Mark Fasheh	4e9563f	2007-11-01 11:37:48 -0700	[diff] [blame]	770	* Nonsparse file systems fully allocate before we get to the write
				771	* code. This prevents ocfs2_write() from tagging the write as an
				772	* allocating one, which means ocfs2_map_page_blocks() might try to
				773	* read-in the blocks at the tail of our file. Avoid reading them by
				774	* testing i_size against each block offset.
				775	*/
				776	static int ocfs2_should_read_blk(struct inode inode, struct page page,
				777	unsigned int block_start)
				778	{
				779	u64 offset = page_offset(page) + block_start;
				780
				781	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
				782	return 1;
				783
				784	if (i_size_read(inode) > offset)
				785	return 1;
				786
				787	return 0;
				788	}
				789
				790	/*
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	791	* Some of this taken from block_prepare_write(). We already have our
				792	* mapping by now though, and the entire write will be allocating or
				793	* it won't, so not much need to use BH_New.
				794	*
				795	* This will also skip zeroing, which is handled externally.
				796	*/
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame]	797	int ocfs2_map_page_blocks(struct page page, u64 p_blkno,
				798	struct inode *inode, unsigned int from,
				799	unsigned int to, int new)
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	800	{
				801	int ret = 0;
				802	struct buffer_head head, bh, wait[2], *wait_bh = wait;
				803	unsigned int block_end, block_start;
				804	unsigned int bsize = 1 << inode->i_blkbits;
				805
				806	if (!page_has_buffers(page))
				807	create_empty_buffers(page, bsize, 0);
				808
				809	head = page_buffers(page);
				810	for (bh = head, block_start = 0; bh != head \|\| !block_start;
				811	bh = bh->b_this_page, block_start += bsize) {
				812	block_end = block_start + bsize;
				813
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	814	clear_buffer_new(bh);
				815
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	816	/*
				817	* Ignore blocks outside of our i/o range -
				818	* they may belong to unallocated clusters.
				819	*/
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame]	820	if (block_start >= to \|\| block_end <= from) {
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	821	if (PageUptodate(page))
				822	set_buffer_uptodate(bh);
				823	continue;
				824	}
				825
				826	/*
				827	* For an allocating write with cluster size >= page
				828	* size, we always write the entire page.
				829	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	830	if (new)
				831	set_buffer_new(bh);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	832
				833	if (!buffer_mapped(bh)) {
				834	map_bh(bh, inode->i_sb, *p_blkno);
				835	unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
				836	}
				837
				838	if (PageUptodate(page)) {
				839	if (!buffer_uptodate(bh))
				840	set_buffer_uptodate(bh);
				841	} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
Mark Fasheh	bce9976	2007-06-18 11:12:36 -0700	[diff] [blame]	842	!buffer_new(bh) &&
Mark Fasheh	4e9563f	2007-11-01 11:37:48 -0700	[diff] [blame]	843	ocfs2_should_read_blk(inode, page, block_start) &&
Mark Fasheh	bce9976	2007-06-18 11:12:36 -0700	[diff] [blame]	844	(block_start < from \|\| block_end > to)) {
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	845	ll_rw_block(READ, 1, &bh);
				846	*wait_bh++=bh;
				847	}
				848
				849	p_blkno = p_blkno + 1;
				850	}
				851
				852	/*
				853	* If we issued read requests - let them complete.
				854	*/
				855	while(wait_bh > wait) {
				856	wait_on_buffer(*--wait_bh);
				857	if (!buffer_uptodate(*wait_bh))
				858	ret = -EIO;
				859	}
				860
				861	if (ret == 0 \|\| !new)
				862	return ret;
				863
				864	/*
				865	* If we get -EIO above, zero out any newly allocated blocks
				866	* to avoid exposing stale data.
				867	*/
				868	bh = head;
				869	block_start = 0;
				870	do {
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	871	block_end = block_start + bsize;
				872	if (block_end <= from)
				873	goto next_bh;
				874	if (block_start >= to)
				875	break;
				876
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	877	zero_user(page, block_start, bh->b_size);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	878	set_buffer_uptodate(bh);
				879	mark_buffer_dirty(bh);
				880
				881	next_bh:
				882	block_start = block_end;
				883	bh = bh->b_this_page;
				884	} while (bh != head);
				885
				886	return ret;
				887	}
				888
/*
 * Size of the page array kept in a write context: one page when a page
 * is at least as large as the biggest cluster, otherwise the number of
 * pages needed to cover the biggest cluster.
 */
#if (PAGE_CACHE_SIZE < OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
#else
#define OCFS2_MAX_CTXT_PAGES	1
#endif

/* Number of minimum-sized clusters that fit into a single page. */
#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
Mark Fasheh	6af67d8	2007-03-06 17:24:46 -0800	[diff] [blame]	896
				897	/*
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	898	* Describe the state of a single cluster to be written to.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	899	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	900	struct ocfs2_write_cluster_desc {
				901	u32 c_cpos;
				902	u32 c_phys;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	903	/*
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	904	* Give this a unique field because c_phys eventually gets
				905	* filled.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	906	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	907	unsigned c_new;
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	908	unsigned c_unwritten;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	909	};
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	910
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	911	static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
				912	{
				913	return d->c_new \|\| d->c_unwritten;
				914	}
				915
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	916	struct ocfs2_write_ctxt {
				917	/* Logical cluster position / len of write */
				918	u32 w_cpos;
				919	u32 w_clen;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	920
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	921	struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	922
				923	/*
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	924	* This is true if page_size > cluster_size.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	925	*
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	926	* It triggers a set of special cases during write which might
				927	* have to deal with allocating writes to partial pages.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	928	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	929	unsigned int w_large_pages;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	930
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	931	/*
				932	* Pages involved in this write.
				933	*
				934	* w_target_page is the page being written to by the user.
				935	*
				936	* w_pages is an array of pages which always contains
				937	* w_target_page, and in the case of an allocating write with
				938	* page_size < cluster size, it will contain zero'd and mapped
				939	* pages adjacent to w_target_page which need to be written
				940	* out in so that future reads from that region will get
				941	* zero's.
				942	*/
				943	struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
				944	unsigned int w_num_pages;
				945	struct page *w_target_page;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	946
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	947	/*
				948	* ocfs2_write_end() uses this to know what the real range to
				949	* write in the target should be.
				950	*/
				951	unsigned int w_target_from;
				952	unsigned int w_target_to;
				953
				954	/*
				955	* We could use journal_current_handle() but this is cleaner,
				956	* IMHO -Mark
				957	*/
				958	handle_t *w_handle;
				959
				960	struct buffer_head *w_di_bh;
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	961
				962	struct ocfs2_cached_dealloc_ctxt w_dealloc;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	963	};
				964
/*
 * Unlock, mark accessed and drop the reference on each non-NULL entry
 * of @pages.  NULL slots are skipped.
 */
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
{
	int idx;

	for (idx = 0; idx < num_pages; idx++) {
		struct page *pg = pages[idx];

		if (!pg)
			continue;

		unlock_page(pg);
		mark_page_accessed(pg);
		page_cache_release(pg);
	}
}
				977
				978	static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
				979	{
				980	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	981
				982	brelse(wc->w_di_bh);
				983	kfree(wc);
				984	}
				985
				986	static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
				987	struct ocfs2_super *osb, loff_t pos,
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	988	unsigned len, struct buffer_head *di_bh)
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	989	{
tao.ma@oracle.com	30b8548	2007-09-06 08:02:25 +0800	[diff] [blame]	990	u32 cend;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	991	struct ocfs2_write_ctxt *wc;
				992
				993	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
				994	if (!wc)
				995	return -ENOMEM;
				996
				997	wc->w_cpos = pos >> osb->s_clustersize_bits;
tao.ma@oracle.com	30b8548	2007-09-06 08:02:25 +0800	[diff] [blame]	998	cend = (pos + len - 1) >> osb->s_clustersize_bits;
				999	wc->w_clen = cend - wc->w_cpos + 1;
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1000	get_bh(di_bh);
				1001	wc->w_di_bh = di_bh;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1002
				1003	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
				1004	wc->w_large_pages = 1;
				1005	else
				1006	wc->w_large_pages = 0;
				1007
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1008	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
				1009
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1010	*wcp = wc;
				1011
				1012	return 0;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1013	}
				1014
				1015	/*
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1016	* If a page has any new buffers, zero them out here, and mark them uptodate
				1017	* and dirty so they'll be written out (in order to prevent uninitialised
				1018	* block data from leaking). And clear the new bit.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1019	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1020	static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1021	{
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1022	unsigned int block_start, block_end;
				1023	struct buffer_head head, bh;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1024
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1025	BUG_ON(!PageLocked(page));
				1026	if (!page_has_buffers(page))
				1027	return;
				1028
				1029	bh = head = page_buffers(page);
				1030	block_start = 0;
				1031	do {
				1032	block_end = block_start + bh->b_size;
				1033
				1034	if (buffer_new(bh)) {
				1035	if (block_end > from && block_start < to) {
				1036	if (!PageUptodate(page)) {
				1037	unsigned start, end;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1038
				1039	start = max(from, block_start);
				1040	end = min(to, block_end);
				1041
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1042	zero_user_segment(page, start, end);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1043	set_buffer_uptodate(bh);
				1044	}
				1045
				1046	clear_buffer_new(bh);
				1047	mark_buffer_dirty(bh);
				1048	}
				1049	}
				1050
				1051	block_start = block_end;
				1052	bh = bh->b_this_page;
				1053	} while (bh != head);
				1054	}
				1055
				1056	/*
				1057	* Only called when we have a failure during allocating write to write
				1058	* zero's to the newly allocated region.
				1059	*/
				1060	static void ocfs2_write_failure(struct inode *inode,
				1061	struct ocfs2_write_ctxt *wc,
				1062	loff_t user_pos, unsigned user_len)
				1063	{
				1064	int i;
Mark Fasheh	5c26a7b	2007-09-18 17:49:29 -0700	[diff] [blame]	1065	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
				1066	to = user_pos + user_len;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1067	struct page *tmppage;
				1068
Mark Fasheh	5c26a7b	2007-09-18 17:49:29 -0700	[diff] [blame]	1069	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1070
				1071	for(i = 0; i < wc->w_num_pages; i++) {
				1072	tmppage = wc->w_pages[i];
				1073
Sunil Mushran	961cecb	2008-07-16 17:22:22 -0700	[diff] [blame]	1074	if (page_has_buffers(tmppage)) {
Joel Becker	2b4e30f	2008-09-03 20:03:41 -0700	[diff] [blame]	1075	if (ocfs2_should_order_data(inode)) {
				1076	ocfs2_jbd2_file_inode(wc->w_handle, inode);
				1077	#ifdef CONFIG_OCFS2_COMPAT_JBD
Sunil Mushran	961cecb	2008-07-16 17:22:22 -0700	[diff] [blame]	1078	walk_page_buffers(wc->w_handle,
				1079	page_buffers(tmppage),
				1080	from, to, NULL,
				1081	ocfs2_journal_dirty_data);
Joel Becker	2b4e30f	2008-09-03 20:03:41 -0700	[diff] [blame]	1082	#endif
				1083	}
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1084
Sunil Mushran	961cecb	2008-07-16 17:22:22 -0700	[diff] [blame]	1085	block_commit_write(tmppage, from, to);
				1086	}
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1087	}
				1088	}
				1089
				1090	static int ocfs2_prepare_page_for_write(struct inode inode, u64 p_blkno,
				1091	struct ocfs2_write_ctxt *wc,
				1092	struct page *page, u32 cpos,
				1093	loff_t user_pos, unsigned user_len,
				1094	int new)
				1095	{
				1096	int ret;
				1097	unsigned int map_from = 0, map_to = 0;
				1098	unsigned int cluster_start, cluster_end;
				1099	unsigned int user_data_from = 0, user_data_to = 0;
				1100
				1101	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1102	&cluster_start, &cluster_end);
				1103
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1104	if (page == wc->w_target_page) {
				1105	map_from = user_pos & (PAGE_CACHE_SIZE - 1);
				1106	map_to = map_from + user_len;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1107
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1108	if (new)
				1109	ret = ocfs2_map_page_blocks(page, p_blkno, inode,
				1110	cluster_start, cluster_end,
				1111	new);
				1112	else
				1113	ret = ocfs2_map_page_blocks(page, p_blkno, inode,
				1114	map_from, map_to, new);
				1115	if (ret) {
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1116	mlog_errno(ret);
				1117	goto out;
				1118	}
				1119
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1120	user_data_from = map_from;
				1121	user_data_to = map_to;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1122	if (new) {
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1123	map_from = cluster_start;
				1124	map_to = cluster_end;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1125	}
				1126	} else {
				1127	/*
				1128	* If we haven't allocated the new page yet, we
				1129	* shouldn't be writing it out without copying user
				1130	* data. This is likely a math error from the caller.
				1131	*/
				1132	BUG_ON(!new);
				1133
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1134	map_from = cluster_start;
				1135	map_to = cluster_end;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1136
				1137	ret = ocfs2_map_page_blocks(page, p_blkno, inode,
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1138	cluster_start, cluster_end, new);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1139	if (ret) {
				1140	mlog_errno(ret);
				1141	goto out;
				1142	}
				1143	}
				1144
				1145	/*
				1146	* Parts of newly allocated pages need to be zero'd.
				1147	*
				1148	* Above, we have also rewritten 'to' and 'from' - as far as
				1149	* the rest of the function is concerned, the entire cluster
				1150	* range inside of a page needs to be written.
				1151	*
				1152	* We can skip this if the page is up to date - it's already
				1153	* been zero'd from being read in as a hole.
				1154	*/
				1155	if (new && !PageUptodate(page))
				1156	ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1157	cpos, user_data_from, user_data_to);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1158
				1159	flush_dcache_page(page);
				1160
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1161	out:
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1162	return ret;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1163	}
				1164
				1165	/*
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1166	* This function will only grab one clusters worth of pages.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1167	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1168	static int ocfs2_grab_pages_for_write(struct address_space *mapping,
				1169	struct ocfs2_write_ctxt *wc,
Mark Fasheh	7307de8	2007-05-09 15:16:19 -0700	[diff] [blame]	1170	u32 cpos, loff_t user_pos, int new,
				1171	struct page *mmap_page)
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1172	{
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1173	int ret = 0, i;
				1174	unsigned long start, target_index, index;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1175	struct inode *inode = mapping->host;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1176
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1177	target_index = user_pos >> PAGE_CACHE_SHIFT;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1178
				1179	/*
				1180	* Figure out how many pages we'll be manipulating here. For
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame]	1181	* non allocating write, we just change the one
				1182	* page. Otherwise, we'll need a whole clusters worth.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1183	*/
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1184	if (new) {
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1185	wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
				1186	start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1187	} else {
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1188	wc->w_num_pages = 1;
				1189	start = target_index;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1190	}
				1191
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1192	for(i = 0; i < wc->w_num_pages; i++) {
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1193	index = start + i;
				1194
Mark Fasheh	7307de8	2007-05-09 15:16:19 -0700	[diff] [blame]	1195	if (index == target_index && mmap_page) {
				1196	/*
				1197	* ocfs2_pagemkwrite() is a little different
				1198	* and wants us to directly use the page
				1199	* passed in.
				1200	*/
				1201	lock_page(mmap_page);
				1202
				1203	if (mmap_page->mapping != mapping) {
				1204	unlock_page(mmap_page);
				1205	/*
				1206	* Sanity check - the locking in
				1207	* ocfs2_pagemkwrite() should ensure
				1208	* that this code doesn't trigger.
				1209	*/
				1210	ret = -EINVAL;
				1211	mlog_errno(ret);
				1212	goto out;
				1213	}
				1214
				1215	page_cache_get(mmap_page);
				1216	wc->w_pages[i] = mmap_page;
				1217	} else {
				1218	wc->w_pages[i] = find_or_create_page(mapping, index,
				1219	GFP_NOFS);
				1220	if (!wc->w_pages[i]) {
				1221	ret = -ENOMEM;
				1222	mlog_errno(ret);
				1223	goto out;
				1224	}
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1225	}
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1226
				1227	if (index == target_index)
				1228	wc->w_target_page = wc->w_pages[i];
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1229	}
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1230	out:
				1231	return ret;
				1232	}
				1233
				1234	/*
				1235	* Prepare a single cluster for write one cluster into the file.
				1236	*/
				1237	static int ocfs2_write_cluster(struct address_space *mapping,
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1238	u32 phys, unsigned int unwritten,
				1239	struct ocfs2_alloc_context *data_ac,
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1240	struct ocfs2_alloc_context *meta_ac,
				1241	struct ocfs2_write_ctxt *wc, u32 cpos,
				1242	loff_t user_pos, unsigned user_len)
				1243	{
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1244	int ret, i, new, should_zero = 0;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1245	u64 v_blkno, p_blkno;
				1246	struct inode *inode = mapping->host;
Joel Becker	f99b9b7	2008-08-20 19:36:33 -0700	[diff] [blame]	1247	struct ocfs2_extent_tree et;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1248
				1249	new = phys == 0 ? 1 : 0;
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1250	if (new \|\| unwritten)
				1251	should_zero = 1;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1252
				1253	if (new) {
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1254	u32 tmp_pos;
				1255
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1256	/*
				1257	* This is safe to call with the page locks - it won't take
				1258	* any additional semaphores or cluster locks.
				1259	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1260	tmp_pos = cpos;
Tao Ma	0eb8d47	2008-08-18 17:38:45 +0800	[diff] [blame]	1261	ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
				1262	&tmp_pos, 1, 0, wc->w_di_bh,
				1263	wc->w_handle, data_ac,
				1264	meta_ac, NULL);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1265	/*
				1266	* This shouldn't happen because we must have already
				1267	* calculated the correct meta data allocation required. The
				1268	* internal tree allocation code should know how to increase
				1269	* transaction credits itself.
				1270	*
				1271	* If need be, we could handle -EAGAIN for a
				1272	* RESTART_TRANS here.
				1273	*/
				1274	mlog_bug_on_msg(ret == -EAGAIN,
				1275	"Inode %llu: EAGAIN return during allocation.\n",
				1276	(unsigned long long)OCFS2_I(inode)->ip_blkno);
				1277	if (ret < 0) {
				1278	mlog_errno(ret);
				1279	goto out;
				1280	}
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1281	} else if (unwritten) {
Joel Becker	8d6220d	2008-08-22 12:46:09 -0700	[diff] [blame]	1282	ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
Joel Becker	f99b9b7	2008-08-20 19:36:33 -0700	[diff] [blame]	1283	ret = ocfs2_mark_extent_written(inode, &et,
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1284	wc->w_handle, cpos, 1, phys,
Joel Becker	f99b9b7	2008-08-20 19:36:33 -0700	[diff] [blame]	1285	meta_ac, &wc->w_dealloc);
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1286	if (ret < 0) {
				1287	mlog_errno(ret);
				1288	goto out;
				1289	}
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1290	}
				1291
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1292	if (should_zero)
				1293	v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
				1294	else
				1295	v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
				1296
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1297	/*
				1298	* The only reason this should fail is due to an inability to
				1299	* find the extent added.
				1300	*/
Mark Fasheh	49cb8d2	2007-03-09 16:21:46 -0800	[diff] [blame]	1301	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
				1302	NULL);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1303	if (ret < 0) {
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1304	ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
				1305	"at logical block %llu",
				1306	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1307	(unsigned long long)v_blkno);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1308	goto out;
				1309	}
				1310
				1311	BUG_ON(p_blkno == 0);
				1312
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1313	for(i = 0; i < wc->w_num_pages; i++) {
				1314	int tmpret;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1315
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1316	tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
				1317	wc->w_pages[i], cpos,
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1318	user_pos, user_len,
				1319	should_zero);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1320	if (tmpret) {
				1321	mlog_errno(tmpret);
				1322	if (ret == 0)
				1323	tmpret = ret;
				1324	}
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1325	}
				1326
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1327	/*
				1328	* We only have cleanup to do in case of allocating write.
				1329	*/
				1330	if (ret && new)
				1331	ocfs2_write_failure(inode, wc, user_pos, user_len);
				1332
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1333	out:
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1334
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1335	return ret;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1336	}
				1337
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1338	static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
				1339	struct ocfs2_alloc_context *data_ac,
				1340	struct ocfs2_alloc_context *meta_ac,
				1341	struct ocfs2_write_ctxt *wc,
				1342	loff_t pos, unsigned len)
				1343	{
				1344	int ret, i;
Mark Fasheh	db56246	2007-09-17 09:06:29 -0700	[diff] [blame]	1345	loff_t cluster_off;
				1346	unsigned int local_len = len;
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1347	struct ocfs2_write_cluster_desc *desc;
Mark Fasheh	db56246	2007-09-17 09:06:29 -0700	[diff] [blame]	1348	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1349
				1350	for (i = 0; i < wc->w_clen; i++) {
				1351	desc = &wc->w_desc[i];
				1352
Mark Fasheh	db56246	2007-09-17 09:06:29 -0700	[diff] [blame]	1353	/*
				1354	* We have to make sure that the total write passed in
				1355	* doesn't extend past a single cluster.
				1356	*/
				1357	local_len = len;
				1358	cluster_off = pos & (osb->s_clustersize - 1);
				1359	if ((cluster_off + local_len) > osb->s_clustersize)
				1360	local_len = osb->s_clustersize - cluster_off;
				1361
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1362	ret = ocfs2_write_cluster(mapping, desc->c_phys,
				1363	desc->c_unwritten, data_ac, meta_ac,
Mark Fasheh	db56246	2007-09-17 09:06:29 -0700	[diff] [blame]	1364	wc, desc->c_cpos, pos, local_len);
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1365	if (ret) {
				1366	mlog_errno(ret);
				1367	goto out;
				1368	}
Mark Fasheh	db56246	2007-09-17 09:06:29 -0700	[diff] [blame]	1369
				1370	len -= local_len;
				1371	pos += local_len;
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1372	}
				1373
				1374	ret = 0;
				1375	out:
				1376	return ret;
				1377	}
				1378
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1379	/*
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1380	* ocfs2_write_end() wants to know which parts of the target page it
				1381	* should complete the write on. It's easiest to compute them ahead of
				1382	* time when a more complete view of the write is available.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1383	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1384	static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
				1385	struct ocfs2_write_ctxt *wc,
				1386	loff_t pos, unsigned len, int alloc)
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1387	{
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1388	struct ocfs2_write_cluster_desc *desc;
				1389
				1390	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
				1391	wc->w_target_to = wc->w_target_from + len;
				1392
				1393	if (alloc == 0)
				1394	return;
				1395
				1396	/*
				1397	* Allocating write - we may have different boundaries based
				1398	* on page size and cluster size.
				1399	*
				1400	* NOTE: We can no longer compute one value from the other as
				1401	* the actual write length and user provided length may be
				1402	* different.
				1403	*/
				1404
				1405	if (wc->w_large_pages) {
				1406	/*
				1407	* We only care about the 1st and last cluster within
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1408	* our range and whether they should be zero'd or not. Either
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1409	* value may be extended out to the start/end of a
				1410	* newly allocated cluster.
				1411	*/
				1412	desc = &wc->w_desc[0];
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1413	if (ocfs2_should_zero_cluster(desc))
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1414	ocfs2_figure_cluster_boundaries(osb,
				1415	desc->c_cpos,
				1416	&wc->w_target_from,
				1417	NULL);
				1418
				1419	desc = &wc->w_desc[wc->w_clen - 1];
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1420	if (ocfs2_should_zero_cluster(desc))
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1421	ocfs2_figure_cluster_boundaries(osb,
				1422	desc->c_cpos,
				1423	NULL,
				1424	&wc->w_target_to);
				1425	} else {
				1426	wc->w_target_from = 0;
				1427	wc->w_target_to = PAGE_CACHE_SIZE;
				1428	}
				1429	}
				1430
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1431	/*
				1432	* Populate each single-cluster write descriptor in the write context
				1433	* with information about the i/o to be done.
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1434	*
				1435	* Returns the number of clusters that will have to be allocated, as
				1436	* well as a worst case estimate of the number of extent records that
				1437	* would have to be created during a write to an unwritten region.
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1438	*/
				1439	static int ocfs2_populate_write_desc(struct inode *inode,
				1440	struct ocfs2_write_ctxt *wc,
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1441	unsigned int *clusters_to_alloc,
				1442	unsigned int *extents_to_split)
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1443	{
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1444	int ret;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1445	struct ocfs2_write_cluster_desc *desc;
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1446	unsigned int num_clusters = 0;
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1447	unsigned int ext_flags = 0;
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1448	u32 phys = 0;
				1449	int i;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1450
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1451	*clusters_to_alloc = 0;
				1452	*extents_to_split = 0;
				1453
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1454	for (i = 0; i < wc->w_clen; i++) {
				1455	desc = &wc->w_desc[i];
				1456	desc->c_cpos = wc->w_cpos + i;
				1457
				1458	if (num_clusters == 0) {
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1459	/*
				1460	* Need to look up the next extent record.
				1461	*/
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1462	ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1463	&num_clusters, &ext_flags);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1464	if (ret) {
				1465	mlog_errno(ret);
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1466	goto out;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1467	}
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1468
				1469	/*
				1470	* Assume worst case - that we're writing in
				1471	* the middle of the extent.
				1472	*
				1473	* We can assume that the write proceeds from
				1474	* left to right, in which case the extent
				1475	* insert code is smart enough to coalesce the
				1476	* next splits into the previous records created.
				1477	*/
				1478	if (ext_flags & OCFS2_EXT_UNWRITTEN)
				1479	extents_to_split = extents_to_split + 2;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1480	} else if (phys) {
				1481	/*
				1482	* Only increment phys if it doesn't describe
				1483	* a hole.
				1484	*/
				1485	phys++;
				1486	}
				1487
				1488	desc->c_phys = phys;
				1489	if (phys == 0) {
				1490	desc->c_new = 1;
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1491	clusters_to_alloc = clusters_to_alloc + 1;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1492	}
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1493	if (ext_flags & OCFS2_EXT_UNWRITTEN)
				1494	desc->c_unwritten = 1;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1495
				1496	num_clusters--;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1497	}
				1498
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1499	ret = 0;
				1500	out:
				1501	return ret;
				1502	}
				1503
Mark Fasheh	1afc32b	2007-09-07 14:46:51 -0700	[diff] [blame]	1504	static int ocfs2_write_begin_inline(struct address_space *mapping,
				1505	struct inode *inode,
				1506	struct ocfs2_write_ctxt *wc)
				1507	{
				1508	int ret;
				1509	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1510	struct page *page;
				1511	handle_t *handle;
				1512	struct ocfs2_dinode di = (struct ocfs2_dinode )wc->w_di_bh->b_data;
				1513
				1514	page = find_or_create_page(mapping, 0, GFP_NOFS);
				1515	if (!page) {
				1516	ret = -ENOMEM;
				1517	mlog_errno(ret);
				1518	goto out;
				1519	}
				1520	/*
				1521	* If we don't set w_num_pages then this page won't get unlocked
				1522	* and freed on cleanup of the write context.
				1523	*/
				1524	wc->w_pages[0] = wc->w_target_page = page;
				1525	wc->w_num_pages = 1;
				1526
				1527	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
				1528	if (IS_ERR(handle)) {
				1529	ret = PTR_ERR(handle);
				1530	mlog_errno(ret);
				1531	goto out;
				1532	}
				1533
				1534	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
				1535	OCFS2_JOURNAL_ACCESS_WRITE);
				1536	if (ret) {
				1537	ocfs2_commit_trans(osb, handle);
				1538
				1539	mlog_errno(ret);
				1540	goto out;
				1541	}
				1542
				1543	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
				1544	ocfs2_set_inode_data_inline(inode, di);
				1545
				1546	if (!PageUptodate(page)) {
				1547	ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
				1548	if (ret) {
				1549	ocfs2_commit_trans(osb, handle);
				1550
				1551	goto out;
				1552	}
				1553	}
				1554
				1555	wc->w_handle = handle;
				1556	out:
				1557	return ret;
				1558	}
				1559
				1560	int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
				1561	{
				1562	struct ocfs2_dinode di = (struct ocfs2_dinode )di_bh->b_data;
				1563
Mark Fasheh	0d8a4e0	2007-11-20 11:48:41 -0800	[diff] [blame]	1564	if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
Mark Fasheh	1afc32b	2007-09-07 14:46:51 -0700	[diff] [blame]	1565	return 1;
				1566	return 0;
				1567	}
				1568
				1569	static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
				1570	struct inode *inode, loff_t pos,
				1571	unsigned len, struct page *mmap_page,
				1572	struct ocfs2_write_ctxt *wc)
				1573	{
				1574	int ret, written = 0;
				1575	loff_t end = pos + len;
				1576	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				1577
				1578	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
				1579	(unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
				1580	oi->ip_dyn_features);
				1581
				1582	/*
				1583	* Handle inodes which already have inline data 1st.
				1584	*/
				1585	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
				1586	if (mmap_page == NULL &&
				1587	ocfs2_size_fits_inline_data(wc->w_di_bh, end))
				1588	goto do_inline_write;
				1589
				1590	/*
				1591	* The write won't fit - we have to give this inode an
				1592	* inline extent list now.
				1593	*/
				1594	ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
				1595	if (ret)
				1596	mlog_errno(ret);
				1597	goto out;
				1598	}
				1599
				1600	/*
				1601	* Check whether the inode can accept inline data.
				1602	*/
				1603	if (oi->ip_clusters != 0 \|\| i_size_read(inode) != 0)
				1604	return 0;
				1605
				1606	/*
				1607	* Check whether the write can fit.
				1608	*/
				1609	if (mmap_page \|\| end > ocfs2_max_inline_data(inode->i_sb))
				1610	return 0;
				1611
				1612	do_inline_write:
				1613	ret = ocfs2_write_begin_inline(mapping, inode, wc);
				1614	if (ret) {
				1615	mlog_errno(ret);
				1616	goto out;
				1617	}
				1618
				1619	/*
				1620	* This signals to the caller that the data can be written
				1621	* inline.
				1622	*/
				1623	written = 1;
				1624	out:
				1625	return written ? written : ret;
				1626	}
				1627
Mark Fasheh	65ed39d	2007-08-28 17:13:23 -0700	[diff] [blame]	1628	/*
				1629	* This function only does anything for file systems which can't
				1630	* handle sparse files.
				1631	*
				1632	* What we want to do here is fill in any hole between the current end
				1633	* of allocation and the end of our write. That way the rest of the
				1634	* write path can treat it as an non-allocating write, which has no
				1635	* special case code for sparse/nonsparse files.
				1636	*/
				1637	static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
				1638	unsigned len,
				1639	struct ocfs2_write_ctxt *wc)
				1640	{
				1641	int ret;
				1642	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1643	loff_t newsize = pos + len;
				1644
				1645	if (ocfs2_sparse_alloc(osb))
				1646	return 0;
				1647
				1648	if (newsize <= i_size_read(inode))
				1649	return 0;
				1650
				1651	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
				1652	if (ret)
				1653	mlog_errno(ret);
				1654
				1655	return ret;
				1656	}
				1657
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1658	int ocfs2_write_begin_nolock(struct address_space *mapping,
				1659	loff_t pos, unsigned len, unsigned flags,
				1660	struct page pagep, void fsdata,
				1661	struct buffer_head di_bh, struct page mmap_page)
				1662	{
				1663	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1664	unsigned int clusters_to_alloc, extents_to_split;
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1665	struct ocfs2_write_ctxt *wc;
				1666	struct inode *inode = mapping->host;
				1667	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1668	struct ocfs2_dinode *di;
				1669	struct ocfs2_alloc_context *data_ac = NULL;
				1670	struct ocfs2_alloc_context *meta_ac = NULL;
				1671	handle_t *handle;
Joel Becker	f99b9b7	2008-08-20 19:36:33 -0700	[diff] [blame]	1672	struct ocfs2_extent_tree et;
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1673
				1674	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
				1675	if (ret) {
				1676	mlog_errno(ret);
				1677	return ret;
				1678	}
				1679
Mark Fasheh	1afc32b	2007-09-07 14:46:51 -0700	[diff] [blame]	1680	if (ocfs2_supports_inline_data(osb)) {
				1681	ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
				1682	mmap_page, wc);
				1683	if (ret == 1) {
				1684	ret = 0;
				1685	goto success;
				1686	}
				1687	if (ret < 0) {
				1688	mlog_errno(ret);
				1689	goto out;
				1690	}
				1691	}
				1692
Mark Fasheh	65ed39d	2007-08-28 17:13:23 -0700	[diff] [blame]	1693	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
				1694	if (ret) {
				1695	mlog_errno(ret);
				1696	goto out;
				1697	}
				1698
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1699	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
				1700	&extents_to_split);
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1701	if (ret) {
				1702	mlog_errno(ret);
				1703	goto out;
				1704	}
				1705
				1706	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
				1707
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1708	/*
				1709	* We set w_target_from, w_target_to here so that
				1710	* ocfs2_write_end() knows which range in the target page to
				1711	* write out. An allocation requires that we write the entire
				1712	* cluster range.
				1713	*/
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1714	if (clusters_to_alloc \|\| extents_to_split) {
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1715	/*
				1716	* XXX: We are stretching the limits of
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1717	* ocfs2_lock_allocators(). It greatly over-estimates
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1718	* the work to be done.
				1719	*/
Tao Ma	e7d4cb6	2008-08-18 17:38:44 +0800	[diff] [blame]	1720	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
				1721	" clusters_to_add = %u, extents_to_split = %u\n",
				1722	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				1723	(long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
				1724	clusters_to_alloc, extents_to_split);
				1725
Joel Becker	8d6220d	2008-08-22 12:46:09 -0700	[diff] [blame]	1726	ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
Joel Becker	f99b9b7	2008-08-20 19:36:33 -0700	[diff] [blame]	1727	ret = ocfs2_lock_allocators(inode, &et,
Tao Ma	231b87d	2008-08-18 17:38:42 +0800	[diff] [blame]	1728	clusters_to_alloc, extents_to_split,
Joel Becker	f99b9b7	2008-08-20 19:36:33 -0700	[diff] [blame]	1729	&data_ac, &meta_ac);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1730	if (ret) {
				1731	mlog_errno(ret);
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1732	goto out;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1733	}
				1734
Tao Ma	811f933	2008-08-18 17:38:43 +0800	[diff] [blame]	1735	credits = ocfs2_calc_extend_credits(inode->i_sb,
				1736	&di->id2.i_list,
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1737	clusters_to_alloc);
				1738
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1739	}
				1740
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1741	ocfs2_set_target_boundaries(osb, wc, pos, len,
				1742	clusters_to_alloc + extents_to_split);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1743
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1744	handle = ocfs2_start_trans(osb, credits);
				1745	if (IS_ERR(handle)) {
				1746	ret = PTR_ERR(handle);
				1747	mlog_errno(ret);
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1748	goto out;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1749	}
				1750
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1751	wc->w_handle = handle;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1752
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1753	/*
				1754	* We don't want this to fail in ocfs2_write_end(), so do it
				1755	* here.
				1756	*/
				1757	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1758	OCFS2_JOURNAL_ACCESS_WRITE);
				1759	if (ret) {
				1760	mlog_errno(ret);
				1761	goto out_commit;
				1762	}
				1763
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1764	/*
				1765	* Fill our page array first. That way we've grabbed enough so
				1766	* that we can zero and flush if we error after adding the
				1767	* extent.
				1768	*/
				1769	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1770	clusters_to_alloc + extents_to_split,
				1771	mmap_page);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1772	if (ret) {
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1773	mlog_errno(ret);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1774	goto out_commit;
				1775	}
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1776
Mark Fasheh	0d172ba	2007-05-14 18:09:54 -0700	[diff] [blame]	1777	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
				1778	len);
				1779	if (ret) {
				1780	mlog_errno(ret);
				1781	goto out_commit;
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1782	}
				1783
				1784	if (data_ac)
				1785	ocfs2_free_alloc_context(data_ac);
				1786	if (meta_ac)
				1787	ocfs2_free_alloc_context(meta_ac);
				1788
Mark Fasheh	1afc32b	2007-09-07 14:46:51 -0700	[diff] [blame]	1789	success:
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1790	*pagep = wc->w_target_page;
				1791	*fsdata = wc;
				1792	return 0;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1793	out_commit:
				1794	ocfs2_commit_trans(osb, handle);
				1795
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1796	out:
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1797	ocfs2_free_write_ctxt(wc);
				1798
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1799	if (data_ac)
				1800	ocfs2_free_alloc_context(data_ac);
				1801	if (meta_ac)
				1802	ocfs2_free_alloc_context(meta_ac);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1803	return ret;
				1804	}
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1805
Nick Piggin	b6af1bc	2007-10-16 01:25:24 -0700	[diff] [blame]	1806	static int ocfs2_write_begin(struct file file, struct address_space mapping,
				1807	loff_t pos, unsigned len, unsigned flags,
				1808	struct page pagep, void fsdata)
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1809	{
				1810	int ret;
				1811	struct buffer_head *di_bh = NULL;
				1812	struct inode *inode = mapping->host;
				1813
Mark Fasheh	e63aecb6	2007-10-18 15:30:42 -0700	[diff] [blame]	1814	ret = ocfs2_inode_lock(inode, &di_bh, 1);
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1815	if (ret) {
				1816	mlog_errno(ret);
				1817	return ret;
				1818	}
				1819
				1820	/*
				1821	* Take alloc sem here to prevent concurrent lookups. That way
				1822	* the mapping, zeroing and tree manipulation within
				1823	* ocfs2_write() will be safe against ->readpage(). This
				1824	* should also serve to lock out allocation from a shared
				1825	* writeable region.
				1826	*/
				1827	down_write(&OCFS2_I(inode)->ip_alloc_sem);
				1828
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1829	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
Mark Fasheh	7307de8	2007-05-09 15:16:19 -0700	[diff] [blame]	1830	fsdata, di_bh, NULL);
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1831	if (ret) {
				1832	mlog_errno(ret);
Mark Fasheh	c934a92	2007-10-18 15:23:46 -0700	[diff] [blame]	1833	goto out_fail;
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1834	}
				1835
				1836	brelse(di_bh);
				1837
				1838	return 0;
				1839
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1840	out_fail:
				1841	up_write(&OCFS2_I(inode)->ip_alloc_sem);
				1842
				1843	brelse(di_bh);
Mark Fasheh	e63aecb6	2007-10-18 15:30:42 -0700	[diff] [blame]	1844	ocfs2_inode_unlock(inode, 1);
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1845
				1846	return ret;
				1847	}
				1848
Mark Fasheh	1afc32b	2007-09-07 14:46:51 -0700	[diff] [blame]	1849	static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
				1850	unsigned len, unsigned *copied,
				1851	struct ocfs2_dinode *di,
				1852	struct ocfs2_write_ctxt *wc)
				1853	{
				1854	void *kaddr;
				1855
				1856	if (unlikely(*copied < len)) {
				1857	if (!PageUptodate(wc->w_target_page)) {
				1858	*copied = 0;
				1859	return;
				1860	}
				1861	}
				1862
				1863	kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
				1864	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
				1865	kunmap_atomic(kaddr, KM_USER0);
				1866
				1867	mlog(0, "Data written to inode at offset %llu. "
				1868	"id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
				1869	(unsigned long long)pos, *copied,
				1870	le16_to_cpu(di->id2.i_data.id_count),
				1871	le16_to_cpu(di->i_dyn_features));
				1872	}
				1873
Mark Fasheh	7307de8	2007-05-09 15:16:19 -0700	[diff] [blame]	1874	int ocfs2_write_end_nolock(struct address_space *mapping,
				1875	loff_t pos, unsigned len, unsigned copied,
				1876	struct page page, void fsdata)
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1877	{
				1878	int i;
				1879	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
				1880	struct inode *inode = mapping->host;
				1881	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1882	struct ocfs2_write_ctxt *wc = fsdata;
				1883	struct ocfs2_dinode di = (struct ocfs2_dinode )wc->w_di_bh->b_data;
				1884	handle_t *handle = wc->w_handle;
				1885	struct page *tmppage;
				1886
Mark Fasheh	1afc32b	2007-09-07 14:46:51 -0700	[diff] [blame]	1887	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
				1888	ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
				1889	goto out_write_size;
				1890	}
				1891
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1892	if (unlikely(copied < len)) {
				1893	if (!PageUptodate(wc->w_target_page))
				1894	copied = 0;
				1895
				1896	ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
				1897	start+len);
				1898	}
				1899	flush_dcache_page(wc->w_target_page);
				1900
				1901	for(i = 0; i < wc->w_num_pages; i++) {
				1902	tmppage = wc->w_pages[i];
				1903
				1904	if (tmppage == wc->w_target_page) {
				1905	from = wc->w_target_from;
				1906	to = wc->w_target_to;
				1907
				1908	BUG_ON(from > PAGE_CACHE_SIZE \|\|
				1909	to > PAGE_CACHE_SIZE \|\|
				1910	to < from);
				1911	} else {
				1912	/*
				1913	* Pages adjacent to the target (if any) imply
				1914	* a hole-filling write in which case we want
				1915	* to flush their entire range.
				1916	*/
				1917	from = 0;
				1918	to = PAGE_CACHE_SIZE;
				1919	}
				1920
Sunil Mushran	961cecb	2008-07-16 17:22:22 -0700	[diff] [blame]	1921	if (page_has_buffers(tmppage)) {
Joel Becker	2b4e30f	2008-09-03 20:03:41 -0700	[diff] [blame]	1922	if (ocfs2_should_order_data(inode)) {
				1923	ocfs2_jbd2_file_inode(wc->w_handle, inode);
				1924	#ifdef CONFIG_OCFS2_COMPAT_JBD
Sunil Mushran	961cecb	2008-07-16 17:22:22 -0700	[diff] [blame]	1925	walk_page_buffers(wc->w_handle,
				1926	page_buffers(tmppage),
				1927	from, to, NULL,
				1928	ocfs2_journal_dirty_data);
Joel Becker	2b4e30f	2008-09-03 20:03:41 -0700	[diff] [blame]	1929	#endif
				1930	}
Sunil Mushran	961cecb	2008-07-16 17:22:22 -0700	[diff] [blame]	1931	block_commit_write(tmppage, from, to);
				1932	}
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1933	}
				1934
Mark Fasheh	1afc32b	2007-09-07 14:46:51 -0700	[diff] [blame]	1935	out_write_size:
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1936	pos += copied;
				1937	if (pos > inode->i_size) {
				1938	i_size_write(inode, pos);
				1939	mark_inode_dirty(inode);
				1940	}
				1941	inode->i_blocks = ocfs2_inode_sector_count(inode);
				1942	di->i_size = cpu_to_le64((u64)i_size_read(inode));
				1943	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
				1944	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
				1945	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1946	ocfs2_journal_dirty(handle, wc->w_di_bh);
				1947
				1948	ocfs2_commit_trans(osb, handle);
Mark Fasheh	59a5e41	2007-06-22 15:52:36 -0700	[diff] [blame]	1949
Mark Fasheh	b27b7cb	2007-06-18 11:22:56 -0700	[diff] [blame]	1950	ocfs2_run_deallocs(osb, &wc->w_dealloc);
				1951
Mark Fasheh	3a307ff	2007-05-08 17:47:32 -0700	[diff] [blame]	1952	ocfs2_free_write_ctxt(wc);
				1953
				1954	return copied;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	1955	}
				1956
Nick Piggin	b6af1bc	2007-10-16 01:25:24 -0700	[diff] [blame]	1957	static int ocfs2_write_end(struct file file, struct address_space mapping,
				1958	loff_t pos, unsigned len, unsigned copied,
				1959	struct page page, void fsdata)
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1960	{
				1961	int ret;
				1962	struct inode *inode = mapping->host;
				1963
				1964	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
				1965
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1966	up_write(&OCFS2_I(inode)->ip_alloc_sem);
Mark Fasheh	e63aecb6	2007-10-18 15:30:42 -0700	[diff] [blame]	1967	ocfs2_inode_unlock(inode, 1);
Mark Fasheh	607d44a	2007-05-09 15:14:45 -0700	[diff] [blame]	1968
				1969	return ret;
				1970	}
				1971
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	1972	const struct address_space_operations ocfs2_aops = {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1973	.readpage = ocfs2_readpage,
Mark Fasheh	628a24f	2007-10-30 12:08:32 -0700	[diff] [blame]	1974	.readpages = ocfs2_readpages,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1975	.writepage = ocfs2_writepage,
Nick Piggin	b6af1bc	2007-10-16 01:25:24 -0700	[diff] [blame]	1976	.write_begin = ocfs2_write_begin,
				1977	.write_end = ocfs2_write_end,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1978	.bmap = ocfs2_bmap,
				1979	.sync_page = block_sync_page,
Joel Becker	03f981c	2007-01-04 14:54:41 -0800	[diff] [blame]	1980	.direct_IO = ocfs2_direct_IO,
				1981	.invalidatepage = ocfs2_invalidatepage,
				1982	.releasepage = ocfs2_releasepage,
				1983	.migratepage = buffer_migrate_page,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1984	};