Blame - fs/ocfs2/aops.c - kernel/msm-4.9

blob: 47152bf9a7f26cdfb82cd02240e5546ba3e47485 [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -*- mode: c; c-basic-offset: 8; -*-
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public
				8	* License as published by the Free Software Foundation; either
				9	* version 2 of the License, or (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it will be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	* General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public
				17	* License along with this program; if not, write to the
				18	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				19	* Boston, MA 02111-1307, USA.
				20	*/
				21
				22	#include <linux/fs.h>
				23	#include <linux/slab.h>
				24	#include <linux/highmem.h>
				25	#include <linux/pagemap.h>
				26	#include <asm/byteorder.h>
				27
				28	#define MLOG_MASK_PREFIX ML_FILE_IO
				29	#include <cluster/masklog.h>
				30
				31	#include "ocfs2.h"
				32
				33	#include "alloc.h"
				34	#include "aops.h"
				35	#include "dlmglue.h"
				36	#include "extent_map.h"
				37	#include "file.h"
				38	#include "inode.h"
				39	#include "journal.h"
				40	#include "super.h"
				41	#include "symlink.h"
				42
				43	#include "buffer_head_io.h"
				44
				45	static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				46	struct buffer_head *bh_result, int create)
				47	{
				48	int err = -EIO;
				49	int status;
				50	struct ocfs2_dinode *fe = NULL;
				51	struct buffer_head *bh = NULL;
				52	struct buffer_head *buffer_cache_bh = NULL;
				53	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				54	void *kaddr;
				55
				56	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				57	(unsigned long long)iblock, bh_result, create);
				58
				59	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
				60
				61	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
				62	mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
				63	(unsigned long long)iblock);
				64	goto bail;
				65	}
				66
				67	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				68	OCFS2_I(inode)->ip_blkno,
				69	&bh, OCFS2_BH_CACHED, inode);
				70	if (status < 0) {
				71	mlog_errno(status);
				72	goto bail;
				73	}
				74	fe = (struct ocfs2_dinode *) bh->b_data;
				75
				76	if (!OCFS2_IS_VALID_DINODE(fe)) {
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	77	mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
				78	(unsigned long long)fe->i_blkno, 7, fe->i_signature);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	79	goto bail;
				80	}
				81
				82	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
				83	le32_to_cpu(fe->i_clusters))) {
				84	mlog(ML_ERROR, "block offset is outside the allocated size: "
				85	"%llu\n", (unsigned long long)iblock);
				86	goto bail;
				87	}
				88
				89	/* We don't use the page cache to create symlink data, so if
				90	* need be, copy it over from the buffer cache. */
				91	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
				92	u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
				93	iblock;
				94	buffer_cache_bh = sb_getblk(osb->sb, blkno);
				95	if (!buffer_cache_bh) {
				96	mlog(ML_ERROR, "couldn't getblock for symlink!\n");
				97	goto bail;
				98	}
				99
				100	/* we haven't locked out transactions, so a commit
				101	* could've happened. Since we've got a reference on
				102	* the bh, even if it commits while we're doing the
				103	* copy, the data is still good. */
				104	if (buffer_jbd(buffer_cache_bh)
				105	&& ocfs2_inode_is_new(inode)) {
				106	kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
				107	if (!kaddr) {
				108	mlog(ML_ERROR, "couldn't kmap!\n");
				109	goto bail;
				110	}
				111	memcpy(kaddr + (bh_result->b_size * iblock),
				112	buffer_cache_bh->b_data,
				113	bh_result->b_size);
				114	kunmap_atomic(kaddr, KM_USER0);
				115	set_buffer_uptodate(bh_result);
				116	}
				117	brelse(buffer_cache_bh);
				118	}
				119
				120	map_bh(bh_result, inode->i_sb,
				121	le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
				122
				123	err = 0;
				124
				125	bail:
				126	if (bh)
				127	brelse(bh);
				128
				129	mlog_exit(err);
				130	return err;
				131	}
				132
				133	static int ocfs2_get_block(struct inode *inode, sector_t iblock,
				134	struct buffer_head *bh_result, int create)
				135	{
				136	int err = 0;
				137	u64 p_blkno, past_eof;
				138
				139	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				140	(unsigned long long)iblock, bh_result, create);
				141
				142	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
				143	mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
				144	inode, inode->i_ino);
				145
				146	if (S_ISLNK(inode->i_mode)) {
				147	/* this always does I/O for some reason. */
				148	err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
				149	goto bail;
				150	}
				151
				152	/* this can happen if another node truncs after our extend! */
				153	spin_lock(&OCFS2_I(inode)->ip_lock);
				154	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
				155	OCFS2_I(inode)->ip_clusters))
				156	err = -EIO;
				157	spin_unlock(&OCFS2_I(inode)->ip_lock);
				158	if (err)
				159	goto bail;
				160
				161	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
				162	NULL);
				163	if (err) {
				164	mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	165	"%llu, NULL)\n", err, inode, (unsigned long long)iblock,
				166	(unsigned long long)p_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	167	goto bail;
				168	}
				169
				170	map_bh(bh_result, inode->i_sb, p_blkno);
				171
				172	if (bh_result->b_blocknr == 0) {
				173	err = -EIO;
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	174	mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
				175	(unsigned long long)iblock,
				176	(unsigned long long)p_blkno,
				177	(unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	178	}
				179
				180	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	181	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
				182	(unsigned long long)past_eof);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	183
				184	if (create && (iblock >= past_eof))
				185	set_buffer_new(bh_result);
				186
				187	bail:
				188	if (err < 0)
				189	err = -EIO;
				190
				191	mlog_exit(err);
				192	return err;
				193	}
				194
				195	static int ocfs2_readpage(struct file file, struct page page)
				196	{
				197	struct inode *inode = page->mapping->host;
				198	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
				199	int ret, unlock = 1;
				200
				201	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
				202
				203	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
				204	if (ret != 0) {
				205	if (ret == AOP_TRUNCATED_PAGE)
				206	unlock = 0;
				207	mlog_errno(ret);
				208	goto out;
				209	}
				210
				211	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				212
				213	/*
				214	* i_size might have just been updated as we grabed the meta lock. We
				215	* might now be discovering a truncate that hit on another node.
				216	* block_read_full_page->get_block freaks out if it is asked to read
				217	* beyond the end of a file, so we check here. Callers
				218	* (generic_file_read, fault->nopage) are clever enough to check i_size
				219	* and notice that the page they just read isn't needed.
				220	*
				221	* XXX sys_readahead() seems to get that wrong?
				222	*/
				223	if (start >= i_size_read(inode)) {
				224	char *addr = kmap(page);
				225	memset(addr, 0, PAGE_SIZE);
				226	flush_dcache_page(page);
				227	kunmap(page);
				228	SetPageUptodate(page);
				229	ret = 0;
				230	goto out_alloc;
				231	}
				232
				233	ret = ocfs2_data_lock_with_page(inode, 0, page);
				234	if (ret != 0) {
				235	if (ret == AOP_TRUNCATED_PAGE)
				236	unlock = 0;
				237	mlog_errno(ret);
				238	goto out_alloc;
				239	}
				240
				241	ret = block_read_full_page(page, ocfs2_get_block);
				242	unlock = 0;
				243
				244	ocfs2_data_unlock(inode, 0);
				245	out_alloc:
				246	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				247	ocfs2_meta_unlock(inode, 0);
				248	out:
				249	if (unlock)
				250	unlock_page(page);
				251	mlog_exit(ret);
				252	return ret;
				253	}
				254
				255	/* Note: Because we don't support holes, our allocation has
				256	* already happened (allocation writes zeros to the file data)
				257	* so we don't have to worry about ordered writes in
				258	* ocfs2_writepage.
				259	*
				260	* ->writepage is called during the process of invalidating the page cache
				261	* during blocked lock processing. It can't block on any cluster locks
				262	* to during block mapping. It's relying on the fact that the block
				263	* mapping can't have disappeared under the dirty pages that it is
				264	* being asked to write back.
				265	*/
				266	static int ocfs2_writepage(struct page page, struct writeback_control wbc)
				267	{
				268	int ret;
				269
				270	mlog_entry("(0x%p)\n", page);
				271
				272	ret = block_write_full_page(page, ocfs2_get_block, wbc);
				273
				274	mlog_exit(ret);
				275
				276	return ret;
				277	}
				278
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	279	/* This can also be called from ocfs2_write_zero_page() which has done
				280	* it's own cluster locking. */
				281	int ocfs2_prepare_write_nolock(struct inode inode, struct page page,
				282	unsigned from, unsigned to)
				283	{
				284	int ret;
				285
				286	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				287
				288	ret = block_prepare_write(page, from, to, ocfs2_get_block);
				289
				290	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				291
				292	return ret;
				293	}
				294
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	295	/*
				296	* ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
				297	* from loopback. It must be able to perform its own locking around
				298	* ocfs2_get_block().
				299	*/
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	300	static int ocfs2_prepare_write(struct file file, struct page page,
				301	unsigned from, unsigned to)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	302	{
				303	struct inode *inode = page->mapping->host;
				304	int ret;
				305
				306	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
				307
				308	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
				309	if (ret != 0) {
				310	mlog_errno(ret);
				311	goto out;
				312	}
				313
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	314	ret = ocfs2_prepare_write_nolock(inode, page, from, to);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	315
				316	ocfs2_meta_unlock(inode, 0);
				317	out:
				318	mlog_exit(ret);
				319	return ret;
				320	}
				321
				322	/* Taken from ext3. We don't necessarily need the full blown
				323	* functionality yet, but IMHO it's better to cut and paste the whole
				324	* thing so we can avoid introducing our own bugs (and easily pick up
				325	* their fixes when they happen) --Mark */
				326	static int walk_page_buffers( handle_t *handle,
				327	struct buffer_head *head,
				328	unsigned from,
				329	unsigned to,
				330	int *partial,
				331	int (fn)( handle_t handle,
				332	struct buffer_head *bh))
				333	{
				334	struct buffer_head *bh;
				335	unsigned block_start, block_end;
				336	unsigned blocksize = head->b_size;
				337	int err, ret = 0;
				338	struct buffer_head *next;
				339
				340	for ( bh = head, block_start = 0;
				341	ret == 0 && (bh != head \|\| !block_start);
				342	block_start = block_end, bh = next)
				343	{
				344	next = bh->b_this_page;
				345	block_end = block_start + blocksize;
				346	if (block_end <= from \|\| block_start >= to) {
				347	if (partial && !buffer_uptodate(bh))
				348	*partial = 1;
				349	continue;
				350	}
				351	err = (*fn)(handle, bh);
				352	if (!ret)
				353	ret = err;
				354	}
				355	return ret;
				356	}
				357
				358	struct ocfs2_journal_handle ocfs2_start_walk_page_trans(struct inode inode,
				359	struct page *page,
				360	unsigned from,
				361	unsigned to)
				362	{
				363	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				364	struct ocfs2_journal_handle *handle = NULL;
				365	int ret = 0;
				366
				367	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
				368	if (!handle) {
				369	ret = -ENOMEM;
				370	mlog_errno(ret);
				371	goto out;
				372	}
				373
				374	if (ocfs2_should_order_data(inode)) {
				375	ret = walk_page_buffers(handle->k_handle,
				376	page_buffers(page),
				377	from, to, NULL,
				378	ocfs2_journal_dirty_data);
				379	if (ret < 0)
				380	mlog_errno(ret);
				381	}
				382	out:
				383	if (ret) {
				384	if (handle)
				385	ocfs2_commit_trans(handle);
				386	handle = ERR_PTR(ret);
				387	}
				388	return handle;
				389	}
				390
				391	static int ocfs2_commit_write(struct file file, struct page page,
				392	unsigned from, unsigned to)
				393	{
				394	int ret, extending = 0, locklevel = 0;
				395	loff_t new_i_size;
				396	struct buffer_head *di_bh = NULL;
				397	struct inode *inode = page->mapping->host;
				398	struct ocfs2_journal_handle *handle = NULL;
				399
				400	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
				401
				402	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
				403	* us to sample inode->i_size here without the metadata lock:
				404	*
				405	* 1) We're currently holding the inode alloc lock, so no
				406	* nodes can change it underneath us.
				407	*
				408	* 2) We've had to take the metadata lock at least once
				409	* already to check for extending writes, hence insuring
				410	* that our current copy is also up to date.
				411	*/
				412	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				413	if (new_i_size > i_size_read(inode)) {
				414	extending = 1;
				415	locklevel = 1;
				416	}
				417
				418	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
				419	if (ret != 0) {
				420	mlog_errno(ret);
				421	goto out;
				422	}
				423
				424	ret = ocfs2_data_lock_with_page(inode, 1, page);
				425	if (ret != 0) {
				426	mlog_errno(ret);
				427	goto out_unlock_meta;
				428	}
				429
				430	if (extending) {
				431	handle = ocfs2_start_walk_page_trans(inode, page, from, to);
				432	if (IS_ERR(handle)) {
				433	ret = PTR_ERR(handle);
				434	handle = NULL;
				435	goto out_unlock_data;
				436	}
				437
				438	/* Mark our buffer early. We'd rather catch this error up here
				439	* as opposed to after a successful commit_write which would
				440	* require us to set back inode->i_size. */
				441	ret = ocfs2_journal_access(handle, inode, di_bh,
				442	OCFS2_JOURNAL_ACCESS_WRITE);
				443	if (ret < 0) {
				444	mlog_errno(ret);
				445	goto out_commit;
				446	}
				447	}
				448
				449	/* might update i_size */
				450	ret = generic_commit_write(file, page, from, to);
				451	if (ret < 0) {
				452	mlog_errno(ret);
				453	goto out_commit;
				454	}
				455
				456	if (extending) {
				457	loff_t size = (u64) i_size_read(inode);
				458	struct ocfs2_dinode *di =
				459	(struct ocfs2_dinode *)di_bh->b_data;
				460
				461	/* ocfs2_mark_inode_dirty is too heavy to use here. */
				462	inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
				463	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
				464
				465	di->i_size = cpu_to_le64(size);
				466	di->i_ctime = di->i_mtime =
				467	cpu_to_le64(inode->i_mtime.tv_sec);
				468	di->i_ctime_nsec = di->i_mtime_nsec =
				469	cpu_to_le32(inode->i_mtime.tv_nsec);
				470
				471	ret = ocfs2_journal_dirty(handle, di_bh);
				472	if (ret < 0) {
				473	mlog_errno(ret);
				474	goto out_commit;
				475	}
				476	}
				477
				478	BUG_ON(extending && (i_size_read(inode) != new_i_size));
				479
				480	out_commit:
				481	if (handle)
				482	ocfs2_commit_trans(handle);
				483	out_unlock_data:
				484	ocfs2_data_unlock(inode, 1);
				485	out_unlock_meta:
				486	ocfs2_meta_unlock(inode, locklevel);
				487	out:
				488	if (di_bh)
				489	brelse(di_bh);
				490
				491	mlog_exit(ret);
				492	return ret;
				493	}
				494
				495	static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
				496	{
				497	sector_t status;
				498	u64 p_blkno = 0;
				499	int err = 0;
				500	struct inode *inode = mapping->host;
				501
				502	mlog_entry("(block = %llu)\n", (unsigned long long)block);
				503
				504	/* We don't need to lock journal system files, since they aren't
				505	* accessed concurrently from multiple nodes.
				506	*/
				507	if (!INODE_JOURNAL(inode)) {
				508	err = ocfs2_meta_lock(inode, NULL, NULL, 0);
				509	if (err) {
				510	if (err != -ENOENT)
				511	mlog_errno(err);
				512	goto bail;
				513	}
				514	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				515	}
				516
				517	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
				518	NULL);
				519
				520	if (!INODE_JOURNAL(inode)) {
				521	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				522	ocfs2_meta_unlock(inode, 0);
				523	}
				524
				525	if (err) {
				526	mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
				527	(unsigned long long)block);
				528	mlog_errno(err);
				529	goto bail;
				530	}
				531
				532
				533	bail:
				534	status = err ? 0 : p_blkno;
				535
				536	mlog_exit((int)status);
				537
				538	return status;
				539	}
				540
				541	/*
				542	* TODO: Make this into a generic get_blocks function.
				543	*
				544	* From do_direct_io in direct-io.c:
				545	* "So what we do is to permit the ->get_blocks function to populate
				546	* bh.b_size with the size of IO which is permitted at this offset and
				547	* this i_blkbits."
				548	*
				549	* This function is called directly from get_more_blocks in direct-io.c.
				550	*
				551	* called like this: dio->get_blocks(dio->inode, fs_startblk,
				552	* fs_count, map_bh, dio->rw == WRITE);
				553	*/
				554	static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	555	struct buffer_head *bh_result, int create)
				556	{
				557	int ret;
				558	u64 vbo_max; /* file offset, max_blocks from iblock */
				559	u64 p_blkno;
				560	int contig_blocks;
				561	unsigned char blocksize_bits;
Badari Pulavarty	1d8fa7a	2006-03-26 01:38:02 -0800	[diff] [blame]	562	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	563
				564	if (!inode \|\| !bh_result) {
				565	mlog(ML_ERROR, "inode or bh_result is null\n");
				566	return -EIO;
				567	}
				568
				569	blocksize_bits = inode->i_sb->s_blocksize_bits;
				570
				571	/* This function won't even be called if the request isn't all
				572	* nicely aligned and of the right size, so there's no need
				573	* for us to check any of that. */
				574
				575	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
				576
				577	spin_lock(&OCFS2_I(inode)->ip_lock);
				578	if ((iblock + max_blocks) >
				579	ocfs2_clusters_to_blocks(inode->i_sb,
				580	OCFS2_I(inode)->ip_clusters)) {
				581	spin_unlock(&OCFS2_I(inode)->ip_lock);
				582	ret = -EIO;
				583	goto bail;
				584	}
				585	spin_unlock(&OCFS2_I(inode)->ip_lock);
				586
				587	/* This figures out the size of the next contiguous block, and
				588	* our logical offset */
				589	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
				590	&contig_blocks);
				591	if (ret) {
				592	mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
				593	(unsigned long long)iblock);
				594	ret = -EIO;
				595	goto bail;
				596	}
				597
				598	map_bh(bh_result, inode->i_sb, p_blkno);
				599
				600	/* make sure we don't map more than max_blocks blocks here as
				601	that's all the kernel will handle at this point. */
				602	if (max_blocks < contig_blocks)
				603	contig_blocks = max_blocks;
				604	bh_result->b_size = contig_blocks << blocksize_bits;
				605	bail:
				606	return ret;
				607	}
				608
				609	/*
				610	* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
				611	* particularly interested in the aio/dio case. Like the core uses
				612	* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
				613	* truncation on another.
				614	*/
				615	static void ocfs2_dio_end_io(struct kiocb *iocb,
				616	loff_t offset,
				617	ssize_t bytes,
				618	void *private)
				619	{
				620	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
				621
				622	/* this io's submitter should not have unlocked this before we could */
				623	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
				624	ocfs2_iocb_clear_rw_locked(iocb);
				625	up_read(&inode->i_alloc_sem);
				626	ocfs2_rw_unlock(inode, 0);
				627	}
				628
				629	static ssize_t ocfs2_direct_IO(int rw,
				630	struct kiocb *iocb,
				631	const struct iovec *iov,
				632	loff_t offset,
				633	unsigned long nr_segs)
				634	{
				635	struct file *file = iocb->ki_filp;
				636	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
				637	int ret;
				638
				639	mlog_entry_void();
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	640
				641	/*
				642	* We get PR data locks even for O_DIRECT. This allows
				643	* concurrent O_DIRECT I/O but doesn't let O_DIRECT with
				644	* extending and buffered zeroing writes race. If they did
				645	* race then the buffered zeroing could be written back after
				646	* the O_DIRECT I/O. It's one thing to tell people not to mix
				647	* buffered and O_DIRECT writes, but expecting them to
				648	* understand that file extension is also an implicit buffered
				649	* write is too much. By getting the PR we force writeback of
				650	* the buffered zeroing before proceeding.
				651	*/
				652	ret = ocfs2_data_lock(inode, 0);
				653	if (ret < 0) {
				654	mlog_errno(ret);
				655	goto out;
				656	}
				657	ocfs2_data_unlock(inode, 0);
				658
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	659	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
				660	inode->i_sb->s_bdev, iov, offset,
				661	nr_segs,
				662	ocfs2_direct_IO_get_blocks,
				663	ocfs2_dio_end_io);
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	664	out:
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	665	mlog_exit(ret);
				666	return ret;
				667	}
				668
				669	struct address_space_operations ocfs2_aops = {
				670	.readpage = ocfs2_readpage,
				671	.writepage = ocfs2_writepage,
				672	.prepare_write = ocfs2_prepare_write,
				673	.commit_write = ocfs2_commit_write,
				674	.bmap = ocfs2_bmap,
				675	.sync_page = block_sync_page,
				676	.direct_IO = ocfs2_direct_IO
				677	};