Blame - fs/ocfs2/aops.c - kernel/msm-4.9

blob: 8f4467a930a548648c33b5cf075a4b022989fd31 [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -*- mode: c; c-basic-offset: 8; -*-
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public
				8	* License as published by the Free Software Foundation; either
				9	* version 2 of the License, or (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it will be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	* General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public
				17	* License along with this program; if not, write to the
				18	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				19	* Boston, MA 02111-1307, USA.
				20	*/
				21
				22	#include <linux/fs.h>
				23	#include <linux/slab.h>
				24	#include <linux/highmem.h>
				25	#include <linux/pagemap.h>
				26	#include <asm/byteorder.h>
				27
				28	#define MLOG_MASK_PREFIX ML_FILE_IO
				29	#include <cluster/masklog.h>
				30
				31	#include "ocfs2.h"
				32
				33	#include "alloc.h"
				34	#include "aops.h"
				35	#include "dlmglue.h"
				36	#include "extent_map.h"
				37	#include "file.h"
				38	#include "inode.h"
				39	#include "journal.h"
				40	#include "super.h"
				41	#include "symlink.h"
				42
				43	#include "buffer_head_io.h"
				44
				45	static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				46	struct buffer_head *bh_result, int create)
				47	{
				48	int err = -EIO;
				49	int status;
				50	struct ocfs2_dinode *fe = NULL;
				51	struct buffer_head *bh = NULL;
				52	struct buffer_head *buffer_cache_bh = NULL;
				53	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				54	void *kaddr;
				55
				56	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				57	(unsigned long long)iblock, bh_result, create);
				58
				59	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
				60
				61	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
				62	mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
				63	(unsigned long long)iblock);
				64	goto bail;
				65	}
				66
				67	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				68	OCFS2_I(inode)->ip_blkno,
				69	&bh, OCFS2_BH_CACHED, inode);
				70	if (status < 0) {
				71	mlog_errno(status);
				72	goto bail;
				73	}
				74	fe = (struct ocfs2_dinode *) bh->b_data;
				75
				76	if (!OCFS2_IS_VALID_DINODE(fe)) {
				77	mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
				78	fe->i_blkno, 7, fe->i_signature);
				79	goto bail;
				80	}
				81
				82	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
				83	le32_to_cpu(fe->i_clusters))) {
				84	mlog(ML_ERROR, "block offset is outside the allocated size: "
				85	"%llu\n", (unsigned long long)iblock);
				86	goto bail;
				87	}
				88
				89	/* We don't use the page cache to create symlink data, so if
				90	* need be, copy it over from the buffer cache. */
				91	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
				92	u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
				93	iblock;
				94	buffer_cache_bh = sb_getblk(osb->sb, blkno);
				95	if (!buffer_cache_bh) {
				96	mlog(ML_ERROR, "couldn't getblock for symlink!\n");
				97	goto bail;
				98	}
				99
				100	/* we haven't locked out transactions, so a commit
				101	* could've happened. Since we've got a reference on
				102	* the bh, even if it commits while we're doing the
				103	* copy, the data is still good. */
				104	if (buffer_jbd(buffer_cache_bh)
				105	&& ocfs2_inode_is_new(inode)) {
				106	kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
				107	if (!kaddr) {
				108	mlog(ML_ERROR, "couldn't kmap!\n");
				109	goto bail;
				110	}
				111	memcpy(kaddr + (bh_result->b_size * iblock),
				112	buffer_cache_bh->b_data,
				113	bh_result->b_size);
				114	kunmap_atomic(kaddr, KM_USER0);
				115	set_buffer_uptodate(bh_result);
				116	}
				117	brelse(buffer_cache_bh);
				118	}
				119
				120	map_bh(bh_result, inode->i_sb,
				121	le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
				122
				123	err = 0;
				124
				125	bail:
				126	if (bh)
				127	brelse(bh);
				128
				129	mlog_exit(err);
				130	return err;
				131	}
				132
				133	static int ocfs2_get_block(struct inode *inode, sector_t iblock,
				134	struct buffer_head *bh_result, int create)
				135	{
				136	int err = 0;
				137	u64 p_blkno, past_eof;
				138
				139	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				140	(unsigned long long)iblock, bh_result, create);
				141
				142	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
				143	mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
				144	inode, inode->i_ino);
				145
				146	if (S_ISLNK(inode->i_mode)) {
				147	/* this always does I/O for some reason. */
				148	err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
				149	goto bail;
				150	}
				151
				152	/* this can happen if another node truncs after our extend! */
				153	spin_lock(&OCFS2_I(inode)->ip_lock);
				154	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
				155	OCFS2_I(inode)->ip_clusters))
				156	err = -EIO;
				157	spin_unlock(&OCFS2_I(inode)->ip_lock);
				158	if (err)
				159	goto bail;
				160
				161	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
				162	NULL);
				163	if (err) {
				164	mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
				165	"%"MLFu64", NULL)\n", err, inode,
				166	(unsigned long long)iblock, p_blkno);
				167	goto bail;
				168	}
				169
				170	map_bh(bh_result, inode->i_sb, p_blkno);
				171
				172	if (bh_result->b_blocknr == 0) {
				173	err = -EIO;
				174	mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
				175	"blkno=(%"MLFu64")\n", (unsigned long long)iblock,
				176	p_blkno, OCFS2_I(inode)->ip_blkno);
				177	}
				178
				179	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
				180	mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
				181
				182	if (create && (iblock >= past_eof))
				183	set_buffer_new(bh_result);
				184
				185	bail:
				186	if (err < 0)
				187	err = -EIO;
				188
				189	mlog_exit(err);
				190	return err;
				191	}
				192
				193	static int ocfs2_readpage(struct file file, struct page page)
				194	{
				195	struct inode *inode = page->mapping->host;
				196	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
				197	int ret, unlock = 1;
				198
				199	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
				200
				201	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
				202	if (ret != 0) {
				203	if (ret == AOP_TRUNCATED_PAGE)
				204	unlock = 0;
				205	mlog_errno(ret);
				206	goto out;
				207	}
				208
				209	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				210
				211	/*
				212	* i_size might have just been updated as we grabed the meta lock. We
				213	* might now be discovering a truncate that hit on another node.
				214	* block_read_full_page->get_block freaks out if it is asked to read
				215	* beyond the end of a file, so we check here. Callers
				216	* (generic_file_read, fault->nopage) are clever enough to check i_size
				217	* and notice that the page they just read isn't needed.
				218	*
				219	* XXX sys_readahead() seems to get that wrong?
				220	*/
				221	if (start >= i_size_read(inode)) {
				222	char *addr = kmap(page);
				223	memset(addr, 0, PAGE_SIZE);
				224	flush_dcache_page(page);
				225	kunmap(page);
				226	SetPageUptodate(page);
				227	ret = 0;
				228	goto out_alloc;
				229	}
				230
				231	ret = ocfs2_data_lock_with_page(inode, 0, page);
				232	if (ret != 0) {
				233	if (ret == AOP_TRUNCATED_PAGE)
				234	unlock = 0;
				235	mlog_errno(ret);
				236	goto out_alloc;
				237	}
				238
				239	ret = block_read_full_page(page, ocfs2_get_block);
				240	unlock = 0;
				241
				242	ocfs2_data_unlock(inode, 0);
				243	out_alloc:
				244	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				245	ocfs2_meta_unlock(inode, 0);
				246	out:
				247	if (unlock)
				248	unlock_page(page);
				249	mlog_exit(ret);
				250	return ret;
				251	}
				252
				253	/* Note: Because we don't support holes, our allocation has
				254	* already happened (allocation writes zeros to the file data)
				255	* so we don't have to worry about ordered writes in
				256	* ocfs2_writepage.
				257	*
				258	* ->writepage is called during the process of invalidating the page cache
				259	* during blocked lock processing. It can't block on any cluster locks
				260	* to during block mapping. It's relying on the fact that the block
				261	* mapping can't have disappeared under the dirty pages that it is
				262	* being asked to write back.
				263	*/
				264	static int ocfs2_writepage(struct page page, struct writeback_control wbc)
				265	{
				266	int ret;
				267
				268	mlog_entry("(0x%p)\n", page);
				269
				270	ret = block_write_full_page(page, ocfs2_get_block, wbc);
				271
				272	mlog_exit(ret);
				273
				274	return ret;
				275	}
				276
				277	/*
				278	* ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
				279	* from loopback. It must be able to perform its own locking around
				280	* ocfs2_get_block().
				281	*/
				282	int ocfs2_prepare_write(struct file file, struct page page,
				283	unsigned from, unsigned to)
				284	{
				285	struct inode *inode = page->mapping->host;
				286	int ret;
				287
				288	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
				289
				290	ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
				291	if (ret != 0) {
				292	mlog_errno(ret);
				293	goto out;
				294	}
				295
				296	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				297
				298	ret = block_prepare_write(page, from, to, ocfs2_get_block);
				299
				300	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				301
				302	ocfs2_meta_unlock(inode, 0);
				303	out:
				304	mlog_exit(ret);
				305	return ret;
				306	}
				307
				308	/* Taken from ext3. We don't necessarily need the full blown
				309	* functionality yet, but IMHO it's better to cut and paste the whole
				310	* thing so we can avoid introducing our own bugs (and easily pick up
				311	* their fixes when they happen) --Mark */
				312	static int walk_page_buffers( handle_t *handle,
				313	struct buffer_head *head,
				314	unsigned from,
				315	unsigned to,
				316	int *partial,
				317	int (fn)( handle_t handle,
				318	struct buffer_head *bh))
				319	{
				320	struct buffer_head *bh;
				321	unsigned block_start, block_end;
				322	unsigned blocksize = head->b_size;
				323	int err, ret = 0;
				324	struct buffer_head *next;
				325
				326	for ( bh = head, block_start = 0;
				327	ret == 0 && (bh != head \|\| !block_start);
				328	block_start = block_end, bh = next)
				329	{
				330	next = bh->b_this_page;
				331	block_end = block_start + blocksize;
				332	if (block_end <= from \|\| block_start >= to) {
				333	if (partial && !buffer_uptodate(bh))
				334	*partial = 1;
				335	continue;
				336	}
				337	err = (*fn)(handle, bh);
				338	if (!ret)
				339	ret = err;
				340	}
				341	return ret;
				342	}
				343
				344	struct ocfs2_journal_handle ocfs2_start_walk_page_trans(struct inode inode,
				345	struct page *page,
				346	unsigned from,
				347	unsigned to)
				348	{
				349	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				350	struct ocfs2_journal_handle *handle = NULL;
				351	int ret = 0;
				352
				353	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
				354	if (!handle) {
				355	ret = -ENOMEM;
				356	mlog_errno(ret);
				357	goto out;
				358	}
				359
				360	if (ocfs2_should_order_data(inode)) {
				361	ret = walk_page_buffers(handle->k_handle,
				362	page_buffers(page),
				363	from, to, NULL,
				364	ocfs2_journal_dirty_data);
				365	if (ret < 0)
				366	mlog_errno(ret);
				367	}
				368	out:
				369	if (ret) {
				370	if (handle)
				371	ocfs2_commit_trans(handle);
				372	handle = ERR_PTR(ret);
				373	}
				374	return handle;
				375	}
				376
				377	static int ocfs2_commit_write(struct file file, struct page page,
				378	unsigned from, unsigned to)
				379	{
				380	int ret, extending = 0, locklevel = 0;
				381	loff_t new_i_size;
				382	struct buffer_head *di_bh = NULL;
				383	struct inode *inode = page->mapping->host;
				384	struct ocfs2_journal_handle *handle = NULL;
				385
				386	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
				387
				388	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
				389	* us to sample inode->i_size here without the metadata lock:
				390	*
				391	* 1) We're currently holding the inode alloc lock, so no
				392	* nodes can change it underneath us.
				393	*
				394	* 2) We've had to take the metadata lock at least once
				395	* already to check for extending writes, hence insuring
				396	* that our current copy is also up to date.
				397	*/
				398	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				399	if (new_i_size > i_size_read(inode)) {
				400	extending = 1;
				401	locklevel = 1;
				402	}
				403
				404	ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
				405	if (ret != 0) {
				406	mlog_errno(ret);
				407	goto out;
				408	}
				409
				410	ret = ocfs2_data_lock_with_page(inode, 1, page);
				411	if (ret != 0) {
				412	mlog_errno(ret);
				413	goto out_unlock_meta;
				414	}
				415
				416	if (extending) {
				417	handle = ocfs2_start_walk_page_trans(inode, page, from, to);
				418	if (IS_ERR(handle)) {
				419	ret = PTR_ERR(handle);
				420	handle = NULL;
				421	goto out_unlock_data;
				422	}
				423
				424	/* Mark our buffer early. We'd rather catch this error up here
				425	* as opposed to after a successful commit_write which would
				426	* require us to set back inode->i_size. */
				427	ret = ocfs2_journal_access(handle, inode, di_bh,
				428	OCFS2_JOURNAL_ACCESS_WRITE);
				429	if (ret < 0) {
				430	mlog_errno(ret);
				431	goto out_commit;
				432	}
				433	}
				434
				435	/* might update i_size */
				436	ret = generic_commit_write(file, page, from, to);
				437	if (ret < 0) {
				438	mlog_errno(ret);
				439	goto out_commit;
				440	}
				441
				442	if (extending) {
				443	loff_t size = (u64) i_size_read(inode);
				444	struct ocfs2_dinode *di =
				445	(struct ocfs2_dinode *)di_bh->b_data;
				446
				447	/* ocfs2_mark_inode_dirty is too heavy to use here. */
				448	inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
				449	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
				450
				451	di->i_size = cpu_to_le64(size);
				452	di->i_ctime = di->i_mtime =
				453	cpu_to_le64(inode->i_mtime.tv_sec);
				454	di->i_ctime_nsec = di->i_mtime_nsec =
				455	cpu_to_le32(inode->i_mtime.tv_nsec);
				456
				457	ret = ocfs2_journal_dirty(handle, di_bh);
				458	if (ret < 0) {
				459	mlog_errno(ret);
				460	goto out_commit;
				461	}
				462	}
				463
				464	BUG_ON(extending && (i_size_read(inode) != new_i_size));
				465
				466	out_commit:
				467	if (handle)
				468	ocfs2_commit_trans(handle);
				469	out_unlock_data:
				470	ocfs2_data_unlock(inode, 1);
				471	out_unlock_meta:
				472	ocfs2_meta_unlock(inode, locklevel);
				473	out:
				474	if (di_bh)
				475	brelse(di_bh);
				476
				477	mlog_exit(ret);
				478	return ret;
				479	}
				480
				481	static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
				482	{
				483	sector_t status;
				484	u64 p_blkno = 0;
				485	int err = 0;
				486	struct inode *inode = mapping->host;
				487
				488	mlog_entry("(block = %llu)\n", (unsigned long long)block);
				489
				490	/* We don't need to lock journal system files, since they aren't
				491	* accessed concurrently from multiple nodes.
				492	*/
				493	if (!INODE_JOURNAL(inode)) {
				494	err = ocfs2_meta_lock(inode, NULL, NULL, 0);
				495	if (err) {
				496	if (err != -ENOENT)
				497	mlog_errno(err);
				498	goto bail;
				499	}
				500	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				501	}
				502
				503	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
				504	NULL);
				505
				506	if (!INODE_JOURNAL(inode)) {
				507	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				508	ocfs2_meta_unlock(inode, 0);
				509	}
				510
				511	if (err) {
				512	mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
				513	(unsigned long long)block);
				514	mlog_errno(err);
				515	goto bail;
				516	}
				517
				518
				519	bail:
				520	status = err ? 0 : p_blkno;
				521
				522	mlog_exit((int)status);
				523
				524	return status;
				525	}
				526
				527	/*
				528	* TODO: Make this into a generic get_blocks function.
				529	*
				530	* From do_direct_io in direct-io.c:
				531	* "So what we do is to permit the ->get_blocks function to populate
				532	* bh.b_size with the size of IO which is permitted at this offset and
				533	* this i_blkbits."
				534	*
				535	* This function is called directly from get_more_blocks in direct-io.c.
				536	*
				537	* called like this: dio->get_blocks(dio->inode, fs_startblk,
				538	* fs_count, map_bh, dio->rw == WRITE);
				539	*/
				540	static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
				541	unsigned long max_blocks,
				542	struct buffer_head *bh_result, int create)
				543	{
				544	int ret;
				545	u64 vbo_max; /* file offset, max_blocks from iblock */
				546	u64 p_blkno;
				547	int contig_blocks;
				548	unsigned char blocksize_bits;
				549
				550	if (!inode \|\| !bh_result) {
				551	mlog(ML_ERROR, "inode or bh_result is null\n");
				552	return -EIO;
				553	}
				554
				555	blocksize_bits = inode->i_sb->s_blocksize_bits;
				556
				557	/* This function won't even be called if the request isn't all
				558	* nicely aligned and of the right size, so there's no need
				559	* for us to check any of that. */
				560
				561	vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
				562
				563	spin_lock(&OCFS2_I(inode)->ip_lock);
				564	if ((iblock + max_blocks) >
				565	ocfs2_clusters_to_blocks(inode->i_sb,
				566	OCFS2_I(inode)->ip_clusters)) {
				567	spin_unlock(&OCFS2_I(inode)->ip_lock);
				568	ret = -EIO;
				569	goto bail;
				570	}
				571	spin_unlock(&OCFS2_I(inode)->ip_lock);
				572
				573	/* This figures out the size of the next contiguous block, and
				574	* our logical offset */
				575	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
				576	&contig_blocks);
				577	if (ret) {
				578	mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
				579	(unsigned long long)iblock);
				580	ret = -EIO;
				581	goto bail;
				582	}
				583
				584	map_bh(bh_result, inode->i_sb, p_blkno);
				585
				586	/* make sure we don't map more than max_blocks blocks here as
				587	that's all the kernel will handle at this point. */
				588	if (max_blocks < contig_blocks)
				589	contig_blocks = max_blocks;
				590	bh_result->b_size = contig_blocks << blocksize_bits;
				591	bail:
				592	return ret;
				593	}
				594
				595	/*
				596	* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
				597	* particularly interested in the aio/dio case. Like the core uses
				598	* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
				599	* truncation on another.
				600	*/
				601	static void ocfs2_dio_end_io(struct kiocb *iocb,
				602	loff_t offset,
				603	ssize_t bytes,
				604	void *private)
				605	{
				606	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
				607
				608	/* this io's submitter should not have unlocked this before we could */
				609	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
				610	ocfs2_iocb_clear_rw_locked(iocb);
				611	up_read(&inode->i_alloc_sem);
				612	ocfs2_rw_unlock(inode, 0);
				613	}
				614
				615	static ssize_t ocfs2_direct_IO(int rw,
				616	struct kiocb *iocb,
				617	const struct iovec *iov,
				618	loff_t offset,
				619	unsigned long nr_segs)
				620	{
				621	struct file *file = iocb->ki_filp;
				622	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
				623	int ret;
				624
				625	mlog_entry_void();
				626	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
				627	inode->i_sb->s_bdev, iov, offset,
				628	nr_segs,
				629	ocfs2_direct_IO_get_blocks,
				630	ocfs2_dio_end_io);
				631	mlog_exit(ret);
				632	return ret;
				633	}
				634
				635	struct address_space_operations ocfs2_aops = {
				636	.readpage = ocfs2_readpage,
				637	.writepage = ocfs2_writepage,
				638	.prepare_write = ocfs2_prepare_write,
				639	.commit_write = ocfs2_commit_write,
				640	.bmap = ocfs2_bmap,
				641	.sync_page = block_sync_page,
				642	.direct_IO = ocfs2_direct_IO
				643	};