Blame - fs/ocfs2/aops.c - fp2-dev/kernel/msm

blob: ef6cd30108a9fb153a244c97050420f93a9efb0e [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -*- mode: c; c-basic-offset: 8; -*-
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public
				8	* License as published by the Free Software Foundation; either
				9	* version 2 of the License, or (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it will be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	* General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public
				17	* License along with this program; if not, write to the
				18	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				19	* Boston, MA 02111-1307, USA.
				20	*/
				21
				22	#include <linux/fs.h>
				23	#include <linux/slab.h>
				24	#include <linux/highmem.h>
				25	#include <linux/pagemap.h>
				26	#include <asm/byteorder.h>
				27
				28	#define MLOG_MASK_PREFIX ML_FILE_IO
				29	#include <cluster/masklog.h>
				30
				31	#include "ocfs2.h"
				32
				33	#include "alloc.h"
				34	#include "aops.h"
				35	#include "dlmglue.h"
				36	#include "extent_map.h"
				37	#include "file.h"
				38	#include "inode.h"
				39	#include "journal.h"
				40	#include "super.h"
				41	#include "symlink.h"
				42
				43	#include "buffer_head_io.h"
				44
				45	static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				46	struct buffer_head *bh_result, int create)
				47	{
				48	int err = -EIO;
				49	int status;
				50	struct ocfs2_dinode *fe = NULL;
				51	struct buffer_head *bh = NULL;
				52	struct buffer_head *buffer_cache_bh = NULL;
				53	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				54	void *kaddr;
				55
				56	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				57	(unsigned long long)iblock, bh_result, create);
				58
				59	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
				60
				61	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
				62	mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
				63	(unsigned long long)iblock);
				64	goto bail;
				65	}
				66
				67	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				68	OCFS2_I(inode)->ip_blkno,
				69	&bh, OCFS2_BH_CACHED, inode);
				70	if (status < 0) {
				71	mlog_errno(status);
				72	goto bail;
				73	}
				74	fe = (struct ocfs2_dinode *) bh->b_data;
				75
				76	if (!OCFS2_IS_VALID_DINODE(fe)) {
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	77	mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
				78	(unsigned long long)fe->i_blkno, 7, fe->i_signature);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	79	goto bail;
				80	}
				81
				82	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
				83	le32_to_cpu(fe->i_clusters))) {
				84	mlog(ML_ERROR, "block offset is outside the allocated size: "
				85	"%llu\n", (unsigned long long)iblock);
				86	goto bail;
				87	}
				88
				89	/* We don't use the page cache to create symlink data, so if
				90	* need be, copy it over from the buffer cache. */
				91	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
				92	u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
				93	iblock;
				94	buffer_cache_bh = sb_getblk(osb->sb, blkno);
				95	if (!buffer_cache_bh) {
				96	mlog(ML_ERROR, "couldn't getblock for symlink!\n");
				97	goto bail;
				98	}
				99
				100	/* we haven't locked out transactions, so a commit
				101	* could've happened. Since we've got a reference on
				102	* the bh, even if it commits while we're doing the
				103	* copy, the data is still good. */
				104	if (buffer_jbd(buffer_cache_bh)
				105	&& ocfs2_inode_is_new(inode)) {
				106	kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
				107	if (!kaddr) {
				108	mlog(ML_ERROR, "couldn't kmap!\n");
				109	goto bail;
				110	}
				111	memcpy(kaddr + (bh_result->b_size * iblock),
				112	buffer_cache_bh->b_data,
				113	bh_result->b_size);
				114	kunmap_atomic(kaddr, KM_USER0);
				115	set_buffer_uptodate(bh_result);
				116	}
				117	brelse(buffer_cache_bh);
				118	}
				119
				120	map_bh(bh_result, inode->i_sb,
				121	le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
				122
				123	err = 0;
				124
				125	bail:
				126	if (bh)
				127	brelse(bh);
				128
				129	mlog_exit(err);
				130	return err;
				131	}
				132
				133	static int ocfs2_get_block(struct inode *inode, sector_t iblock,
				134	struct buffer_head *bh_result, int create)
				135	{
				136	int err = 0;
				137	u64 p_blkno, past_eof;
				138
				139	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				140	(unsigned long long)iblock, bh_result, create);
				141
				142	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
				143	mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
				144	inode, inode->i_ino);
				145
				146	if (S_ISLNK(inode->i_mode)) {
				147	/* this always does I/O for some reason. */
				148	err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
				149	goto bail;
				150	}
				151
				152	/* this can happen if another node truncs after our extend! */
				153	spin_lock(&OCFS2_I(inode)->ip_lock);
				154	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
				155	OCFS2_I(inode)->ip_clusters))
				156	err = -EIO;
				157	spin_unlock(&OCFS2_I(inode)->ip_lock);
				158	if (err)
				159	goto bail;
				160
				161	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
				162	NULL);
				163	if (err) {
				164	mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	165	"%llu, NULL)\n", err, inode, (unsigned long long)iblock,
				166	(unsigned long long)p_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	167	goto bail;
				168	}
				169
				170	map_bh(bh_result, inode->i_sb, p_blkno);
				171
				172	if (bh_result->b_blocknr == 0) {
				173	err = -EIO;
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	174	mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
				175	(unsigned long long)iblock,
				176	(unsigned long long)p_blkno,
				177	(unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	178	}
				179
				180	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	181	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
				182	(unsigned long long)past_eof);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	183
				184	if (create && (iblock >= past_eof))
				185	set_buffer_new(bh_result);
				186
				187	bail:
				188	if (err < 0)
				189	err = -EIO;
				190
				191	mlog_exit(err);
				192	return err;
				193	}
				194
				195	static int ocfs2_readpage(struct file file, struct page page)
				196	{
				197	struct inode *inode = page->mapping->host;
				198	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
				199	int ret, unlock = 1;
				200
				201	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
				202
Mark Fasheh	4bcec18	2006-10-09 16:02:40 -0700	[diff] [blame]	203	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	204	if (ret != 0) {
				205	if (ret == AOP_TRUNCATED_PAGE)
				206	unlock = 0;
				207	mlog_errno(ret);
				208	goto out;
				209	}
				210
				211	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				212
				213	/*
				214	* i_size might have just been updated as we grabed the meta lock. We
				215	* might now be discovering a truncate that hit on another node.
				216	* block_read_full_page->get_block freaks out if it is asked to read
				217	* beyond the end of a file, so we check here. Callers
				218	* (generic_file_read, fault->nopage) are clever enough to check i_size
				219	* and notice that the page they just read isn't needed.
				220	*
				221	* XXX sys_readahead() seems to get that wrong?
				222	*/
				223	if (start >= i_size_read(inode)) {
				224	char *addr = kmap(page);
				225	memset(addr, 0, PAGE_SIZE);
				226	flush_dcache_page(page);
				227	kunmap(page);
				228	SetPageUptodate(page);
				229	ret = 0;
				230	goto out_alloc;
				231	}
				232
				233	ret = ocfs2_data_lock_with_page(inode, 0, page);
				234	if (ret != 0) {
				235	if (ret == AOP_TRUNCATED_PAGE)
				236	unlock = 0;
				237	mlog_errno(ret);
				238	goto out_alloc;
				239	}
				240
				241	ret = block_read_full_page(page, ocfs2_get_block);
				242	unlock = 0;
				243
				244	ocfs2_data_unlock(inode, 0);
				245	out_alloc:
				246	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				247	ocfs2_meta_unlock(inode, 0);
				248	out:
				249	if (unlock)
				250	unlock_page(page);
				251	mlog_exit(ret);
				252	return ret;
				253	}
				254
				255	/* Note: Because we don't support holes, our allocation has
				256	* already happened (allocation writes zeros to the file data)
				257	* so we don't have to worry about ordered writes in
				258	* ocfs2_writepage.
				259	*
				260	* ->writepage is called during the process of invalidating the page cache
				261	* during blocked lock processing. It can't block on any cluster locks
				262	* to during block mapping. It's relying on the fact that the block
				263	* mapping can't have disappeared under the dirty pages that it is
				264	* being asked to write back.
				265	*/
				266	static int ocfs2_writepage(struct page page, struct writeback_control wbc)
				267	{
				268	int ret;
				269
				270	mlog_entry("(0x%p)\n", page);
				271
				272	ret = block_write_full_page(page, ocfs2_get_block, wbc);
				273
				274	mlog_exit(ret);
				275
				276	return ret;
				277	}
				278
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	279	/* This can also be called from ocfs2_write_zero_page() which has done
				280	* it's own cluster locking. */
				281	int ocfs2_prepare_write_nolock(struct inode inode, struct page page,
				282	unsigned from, unsigned to)
				283	{
				284	int ret;
				285
				286	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				287
				288	ret = block_prepare_write(page, from, to, ocfs2_get_block);
				289
				290	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				291
				292	return ret;
				293	}
				294
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	295	/*
				296	* ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
				297	* from loopback. It must be able to perform its own locking around
				298	* ocfs2_get_block().
				299	*/
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	300	static int ocfs2_prepare_write(struct file file, struct page page,
				301	unsigned from, unsigned to)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	302	{
				303	struct inode *inode = page->mapping->host;
				304	int ret;
				305
				306	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
				307
Mark Fasheh	4bcec18	2006-10-09 16:02:40 -0700	[diff] [blame]	308	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	309	if (ret != 0) {
				310	mlog_errno(ret);
				311	goto out;
				312	}
				313
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	314	ret = ocfs2_prepare_write_nolock(inode, page, from, to);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	315
				316	ocfs2_meta_unlock(inode, 0);
				317	out:
				318	mlog_exit(ret);
				319	return ret;
				320	}
				321
				322	/* Taken from ext3. We don't necessarily need the full blown
				323	* functionality yet, but IMHO it's better to cut and paste the whole
				324	* thing so we can avoid introducing our own bugs (and easily pick up
				325	* their fixes when they happen) --Mark */
				326	static int walk_page_buffers( handle_t *handle,
				327	struct buffer_head *head,
				328	unsigned from,
				329	unsigned to,
				330	int *partial,
				331	int (fn)( handle_t handle,
				332	struct buffer_head *bh))
				333	{
				334	struct buffer_head *bh;
				335	unsigned block_start, block_end;
				336	unsigned blocksize = head->b_size;
				337	int err, ret = 0;
				338	struct buffer_head *next;
				339
				340	for ( bh = head, block_start = 0;
				341	ret == 0 && (bh != head \|\| !block_start);
				342	block_start = block_end, bh = next)
				343	{
				344	next = bh->b_this_page;
				345	block_end = block_start + blocksize;
				346	if (block_end <= from \|\| block_start >= to) {
				347	if (partial && !buffer_uptodate(bh))
				348	*partial = 1;
				349	continue;
				350	}
				351	err = (*fn)(handle, bh);
				352	if (!ret)
				353	ret = err;
				354	}
				355	return ret;
				356	}
				357
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	358	handle_t ocfs2_start_walk_page_trans(struct inode inode,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	359	struct page *page,
				360	unsigned from,
				361	unsigned to)
				362	{
				363	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	364	handle_t *handle = NULL;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	365	int ret = 0;
				366
Mark Fasheh	65eff9c	2006-10-09 17:26:22 -0700	[diff] [blame]	367	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	368	if (!handle) {
				369	ret = -ENOMEM;
				370	mlog_errno(ret);
				371	goto out;
				372	}
				373
				374	if (ocfs2_should_order_data(inode)) {
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	375	ret = walk_page_buffers(handle,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	376	page_buffers(page),
				377	from, to, NULL,
				378	ocfs2_journal_dirty_data);
				379	if (ret < 0)
				380	mlog_errno(ret);
				381	}
				382	out:
				383	if (ret) {
				384	if (handle)
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	385	ocfs2_commit_trans(osb, handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	386	handle = ERR_PTR(ret);
				387	}
				388	return handle;
				389	}
				390
				391	static int ocfs2_commit_write(struct file file, struct page page,
				392	unsigned from, unsigned to)
				393	{
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	394	int ret;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	395	struct buffer_head *di_bh = NULL;
				396	struct inode *inode = page->mapping->host;
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	397	handle_t *handle = NULL;
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	398	struct ocfs2_dinode *di;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	399
				400	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
				401
				402	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	403	* us to continue here without rechecking the I/O against
				404	* changed inode values.
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	405	*
				406	* 1) We're currently holding the inode alloc lock, so no
				407	* nodes can change it underneath us.
				408	*
				409	* 2) We've had to take the metadata lock at least once
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	410	* already to check for extending writes, suid removal, etc.
				411	* The meta data update code then ensures that we don't get a
				412	* stale inode allocation image (i_size, i_clusters, etc).
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	413	*/
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	414
Mark Fasheh	4bcec18	2006-10-09 16:02:40 -0700	[diff] [blame]	415	ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	416	if (ret != 0) {
				417	mlog_errno(ret);
				418	goto out;
				419	}
				420
				421	ret = ocfs2_data_lock_with_page(inode, 1, page);
				422	if (ret != 0) {
				423	mlog_errno(ret);
				424	goto out_unlock_meta;
				425	}
				426
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	427	handle = ocfs2_start_walk_page_trans(inode, page, from, to);
				428	if (IS_ERR(handle)) {
				429	ret = PTR_ERR(handle);
				430	goto out_unlock_data;
				431	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	432
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	433	/* Mark our buffer early. We'd rather catch this error up here
				434	* as opposed to after a successful commit_write which would
				435	* require us to set back inode->i_size. */
				436	ret = ocfs2_journal_access(handle, inode, di_bh,
				437	OCFS2_JOURNAL_ACCESS_WRITE);
				438	if (ret < 0) {
				439	mlog_errno(ret);
				440	goto out_commit;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	441	}
				442
				443	/* might update i_size */
				444	ret = generic_commit_write(file, page, from, to);
				445	if (ret < 0) {
				446	mlog_errno(ret);
				447	goto out_commit;
				448	}
				449
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	450	di = (struct ocfs2_dinode *)di_bh->b_data;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	451
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	452	/* ocfs2_mark_inode_dirty() is too heavy to use here. */
				453	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
				454	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
				455	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	456
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	457	inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
				458	di->i_size = cpu_to_le64((u64)i_size_read(inode));
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	459
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	460	ret = ocfs2_journal_dirty(handle, di_bh);
				461	if (ret < 0) {
				462	mlog_errno(ret);
				463	goto out_commit;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	464	}
				465
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	466	out_commit:
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	467	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	468	out_unlock_data:
				469	ocfs2_data_unlock(inode, 1);
				470	out_unlock_meta:
Mark Fasheh	e0b4096	2006-07-11 14:38:54 -0700	[diff] [blame]	471	ocfs2_meta_unlock(inode, 1);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	472	out:
				473	if (di_bh)
				474	brelse(di_bh);
				475
				476	mlog_exit(ret);
				477	return ret;
				478	}
				479
				480	static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
				481	{
				482	sector_t status;
				483	u64 p_blkno = 0;
				484	int err = 0;
				485	struct inode *inode = mapping->host;
				486
				487	mlog_entry("(block = %llu)\n", (unsigned long long)block);
				488
				489	/* We don't need to lock journal system files, since they aren't
				490	* accessed concurrently from multiple nodes.
				491	*/
				492	if (!INODE_JOURNAL(inode)) {
Mark Fasheh	4bcec18	2006-10-09 16:02:40 -0700	[diff] [blame]	493	err = ocfs2_meta_lock(inode, NULL, 0);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	494	if (err) {
				495	if (err != -ENOENT)
				496	mlog_errno(err);
				497	goto bail;
				498	}
				499	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				500	}
				501
				502	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
				503	NULL);
				504
				505	if (!INODE_JOURNAL(inode)) {
				506	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				507	ocfs2_meta_unlock(inode, 0);
				508	}
				509
				510	if (err) {
				511	mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
				512	(unsigned long long)block);
				513	mlog_errno(err);
				514	goto bail;
				515	}
				516
				517
				518	bail:
				519	status = err ? 0 : p_blkno;
				520
				521	mlog_exit((int)status);
				522
				523	return status;
				524	}
				525
				526	/*
				527	* TODO: Make this into a generic get_blocks function.
				528	*
				529	* From do_direct_io in direct-io.c:
				530	* "So what we do is to permit the ->get_blocks function to populate
				531	* bh.b_size with the size of IO which is permitted at this offset and
				532	* this i_blkbits."
				533	*
				534	* This function is called directly from get_more_blocks in direct-io.c.
				535	*
				536	* called like this: dio->get_blocks(dio->inode, fs_startblk,
				537	* fs_count, map_bh, dio->rw == WRITE);
				538	*/
				539	static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	540	struct buffer_head *bh_result, int create)
				541	{
				542	int ret;
				543	u64 vbo_max; /* file offset, max_blocks from iblock */
				544	u64 p_blkno;
				545	int contig_blocks;
Florin Malita	184d7d2	2006-06-03 19:30:10 -0400	[diff] [blame]	546	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
Badari Pulavarty	1d8fa7a	2006-03-26 01:38:02 -0800	[diff] [blame]	547	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	548
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	549	/* This function won't even be called if the request isn't all
				550	* nicely aligned and of the right size, so there's no need
				551	* for us to check any of that. */
				552
				553	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
				554
				555	spin_lock(&OCFS2_I(inode)->ip_lock);
				556	if ((iblock + max_blocks) >
				557	ocfs2_clusters_to_blocks(inode->i_sb,
				558	OCFS2_I(inode)->ip_clusters)) {
				559	spin_unlock(&OCFS2_I(inode)->ip_lock);
				560	ret = -EIO;
				561	goto bail;
				562	}
				563	spin_unlock(&OCFS2_I(inode)->ip_lock);
				564
				565	/* This figures out the size of the next contiguous block, and
				566	* our logical offset */
				567	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
				568	&contig_blocks);
				569	if (ret) {
				570	mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
				571	(unsigned long long)iblock);
				572	ret = -EIO;
				573	goto bail;
				574	}
				575
				576	map_bh(bh_result, inode->i_sb, p_blkno);
				577
				578	/* make sure we don't map more than max_blocks blocks here as
				579	that's all the kernel will handle at this point. */
				580	if (max_blocks < contig_blocks)
				581	contig_blocks = max_blocks;
				582	bh_result->b_size = contig_blocks << blocksize_bits;
				583	bail:
				584	return ret;
				585	}
				586
				587	/*
				588	* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
				589	* particularly interested in the aio/dio case. Like the core uses
				590	* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
				591	* truncation on another.
				592	*/
				593	static void ocfs2_dio_end_io(struct kiocb *iocb,
				594	loff_t offset,
				595	ssize_t bytes,
				596	void *private)
				597	{
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	598	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	599
				600	/* this io's submitter should not have unlocked this before we could */
				601	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
				602	ocfs2_iocb_clear_rw_locked(iocb);
				603	up_read(&inode->i_alloc_sem);
				604	ocfs2_rw_unlock(inode, 0);
				605	}
				606
				607	static ssize_t ocfs2_direct_IO(int rw,
				608	struct kiocb *iocb,
				609	const struct iovec *iov,
				610	loff_t offset,
				611	unsigned long nr_segs)
				612	{
				613	struct file *file = iocb->ki_filp;
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	614	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	615	int ret;
				616
				617	mlog_entry_void();
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	618
				619	/*
				620	* We get PR data locks even for O_DIRECT. This allows
				621	* concurrent O_DIRECT I/O but doesn't let O_DIRECT with
				622	* extending and buffered zeroing writes race. If they did
				623	* race then the buffered zeroing could be written back after
				624	* the O_DIRECT I/O. It's one thing to tell people not to mix
				625	* buffered and O_DIRECT writes, but expecting them to
				626	* understand that file extension is also an implicit buffered
				627	* write is too much. By getting the PR we force writeback of
				628	* the buffered zeroing before proceeding.
				629	*/
				630	ret = ocfs2_data_lock(inode, 0);
				631	if (ret < 0) {
				632	mlog_errno(ret);
				633	goto out;
				634	}
				635	ocfs2_data_unlock(inode, 0);
				636
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	637	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
				638	inode->i_sb->s_bdev, iov, offset,
				639	nr_segs,
				640	ocfs2_direct_IO_get_blocks,
				641	ocfs2_dio_end_io);
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	642	out:
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	643	mlog_exit(ret);
				644	return ret;
				645	}
				646
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	647	const struct address_space_operations ocfs2_aops = {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	648	.readpage = ocfs2_readpage,
				649	.writepage = ocfs2_writepage,
				650	.prepare_write = ocfs2_prepare_write,
				651	.commit_write = ocfs2_commit_write,
				652	.bmap = ocfs2_bmap,
				653	.sync_page = block_sync_page,
				654	.direct_IO = ocfs2_direct_IO
				655	};