Blame - fs/xfs/linux-2.6/xfs_lrw.c - kernel/msm-4.9

blob: ff145fd0d1a42e3771a0f4fd4691565864fce7b6 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or modify it
				5	* under the terms of version 2 of the GNU General Public License as
				6	* published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it would be useful, but
				9	* WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				11	*
				12	* Further, this software is distributed without any warranty that it is
				13	* free of the rightful claim of any third person regarding infringement
				14	* or the like. Any license provided herein, whether implied or
				15	* otherwise, applies only to this software file. Patent licenses, if
				16	* any, provided herein do not apply to combinations of this program with
				17	* other software, or any other product whatsoever.
				18	*
				19	* You should have received a copy of the GNU General Public License along
				20	* with this program; if not, write the Free Software Foundation, Inc., 59
				21	* Temple Place - Suite 330, Boston MA 02111-1307, USA.
				22	*
				23	* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
				24	* Mountain View, CA 94043, or:
				25	*
				26	* http://www.sgi.com
				27	*
				28	* For further information regarding this notice, see:
				29	*
				30	* http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
				31	*/
				32	/*
				33	* fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
				34	*
				35	*/
				36
				37	#include "xfs.h"
				38
				39	#include "xfs_fs.h"
				40	#include "xfs_inum.h"
				41	#include "xfs_log.h"
				42	#include "xfs_trans.h"
				43	#include "xfs_sb.h"
				44	#include "xfs_ag.h"
				45	#include "xfs_dir.h"
				46	#include "xfs_dir2.h"
				47	#include "xfs_alloc.h"
				48	#include "xfs_dmapi.h"
				49	#include "xfs_quota.h"
				50	#include "xfs_mount.h"
				51	#include "xfs_alloc_btree.h"
				52	#include "xfs_bmap_btree.h"
				53	#include "xfs_ialloc_btree.h"
				54	#include "xfs_btree.h"
				55	#include "xfs_ialloc.h"
				56	#include "xfs_attr_sf.h"
				57	#include "xfs_dir_sf.h"
				58	#include "xfs_dir2_sf.h"
				59	#include "xfs_dinode.h"
				60	#include "xfs_inode.h"
				61	#include "xfs_bmap.h"
				62	#include "xfs_bit.h"
				63	#include "xfs_rtalloc.h"
				64	#include "xfs_error.h"
				65	#include "xfs_itable.h"
				66	#include "xfs_rw.h"
				67	#include "xfs_acl.h"
				68	#include "xfs_cap.h"
				69	#include "xfs_mac.h"
				70	#include "xfs_attr.h"
				71	#include "xfs_inode_item.h"
				72	#include "xfs_buf_item.h"
				73	#include "xfs_utils.h"
				74	#include "xfs_iomap.h"
				75
				76	#include <linux/capability.h>
				77	#include <linux/writeback.h>
				78
				79
				80	#if defined(XFS_RW_TRACE)
				81	void
				82	xfs_rw_enter_trace(
				83	int tag,
				84	xfs_iocore_t *io,
				85	void *data,
				86	size_t segs,
				87	loff_t offset,
				88	int ioflags)
				89	{
				90	xfs_inode_t *ip = XFS_IO_INODE(io);
				91
				92	if (ip->i_rwtrace == NULL)
				93	return;
				94	ktrace_enter(ip->i_rwtrace,
				95	(void *)(unsigned long)tag,
				96	(void *)ip,
				97	(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
				98	(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
				99	(void *)data,
				100	(void *)((unsigned long)segs),
				101	(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
				102	(void *)((unsigned long)(offset & 0xffffffff)),
				103	(void *)((unsigned long)ioflags),
				104	(void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
				105	(void *)((unsigned long)(io->io_new_size & 0xffffffff)),
				106	(void *)NULL,
				107	(void *)NULL,
				108	(void *)NULL,
				109	(void *)NULL,
				110	(void *)NULL);
				111	}
				112
				113	void
				114	xfs_inval_cached_trace(
				115	xfs_iocore_t *io,
				116	xfs_off_t offset,
				117	xfs_off_t len,
				118	xfs_off_t first,
				119	xfs_off_t last)
				120	{
				121	xfs_inode_t *ip = XFS_IO_INODE(io);
				122
				123	if (ip->i_rwtrace == NULL)
				124	return;
				125	ktrace_enter(ip->i_rwtrace,
				126	(void *)(__psint_t)XFS_INVAL_CACHED,
				127	(void *)ip,
				128	(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
				129	(void *)((unsigned long)(offset & 0xffffffff)),
				130	(void *)((unsigned long)((len >> 32) & 0xffffffff)),
				131	(void *)((unsigned long)(len & 0xffffffff)),
				132	(void *)((unsigned long)((first >> 32) & 0xffffffff)),
				133	(void *)((unsigned long)(first & 0xffffffff)),
				134	(void *)((unsigned long)((last >> 32) & 0xffffffff)),
				135	(void *)((unsigned long)(last & 0xffffffff)),
				136	(void *)NULL,
				137	(void *)NULL,
				138	(void *)NULL,
				139	(void *)NULL,
				140	(void *)NULL,
				141	(void *)NULL);
				142	}
				143	#endif
				144
				145	/*
				146	* xfs_iozero
				147	*
				148	* xfs_iozero clears the specified range of buffer supplied,
				149	* and marks all the affected blocks as valid and modified. If
				150	* an affected block is not allocated, it will be allocated. If
				151	* an affected block is not completely overwritten, and is not
				152	* valid before the operation, it will be read from disk before
				153	* being partially zeroed.
				154	*/
				155	STATIC int
				156	xfs_iozero(
				157	struct inode ip, / inode */
				158	loff_t pos, /* offset in file */
				159	size_t count, /* size of data to zero */
				160	loff_t end_size) /* max file size to set */
				161	{
				162	unsigned bytes;
				163	struct page *page;
				164	struct address_space *mapping;
				165	char *kaddr;
				166	int status;
				167
				168	mapping = ip->i_mapping;
				169	do {
				170	unsigned long index, offset;
				171
				172	offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
				173	index = pos >> PAGE_CACHE_SHIFT;
				174	bytes = PAGE_CACHE_SIZE - offset;
				175	if (bytes > count)
				176	bytes = count;
				177
				178	status = -ENOMEM;
				179	page = grab_cache_page(mapping, index);
				180	if (!page)
				181	break;
				182
				183	kaddr = kmap(page);
				184	status = mapping->a_ops->prepare_write(NULL, page, offset,
				185	offset + bytes);
				186	if (status) {
				187	goto unlock;
				188	}
				189
				190	memset((void *) (kaddr + offset), 0, bytes);
				191	flush_dcache_page(page);
				192	status = mapping->a_ops->commit_write(NULL, page, offset,
				193	offset + bytes);
				194	if (!status) {
				195	pos += bytes;
				196	count -= bytes;
				197	if (pos > i_size_read(ip))
				198	i_size_write(ip, pos < end_size ? pos : end_size);
				199	}
				200
				201	unlock:
				202	kunmap(page);
				203	unlock_page(page);
				204	page_cache_release(page);
				205	if (status)
				206	break;
				207	} while (count);
				208
				209	return (-status);
				210	}
				211
				212	/*
				213	* xfs_inval_cached_pages
				214	*
				215	* This routine is responsible for keeping direct I/O and buffered I/O
				216	* somewhat coherent. From here we make sure that we're at least
				217	* temporarily holding the inode I/O lock exclusively and then call
				218	* the page cache to flush and invalidate any cached pages. If there
				219	* are no cached pages this routine will be very quick.
				220	*/
				221	void
				222	xfs_inval_cached_pages(
				223	vnode_t *vp,
				224	xfs_iocore_t *io,
				225	xfs_off_t offset,
				226	int write,
				227	int relock)
				228	{
				229	if (VN_CACHED(vp)) {
				230	xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1);
				231	VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
				232	}
				233
				234	}
				235
				236	ssize_t /* bytes read, or (-) error */
				237	xfs_read(
				238	bhv_desc_t *bdp,
				239	struct kiocb *iocb,
				240	const struct iovec *iovp,
				241	unsigned int segs,
				242	loff_t *offset,
				243	int ioflags,
				244	cred_t *credp)
				245	{
				246	struct file *file = iocb->ki_filp;
				247	struct inode *inode = file->f_mapping->host;
				248	size_t size = 0;
				249	ssize_t ret;
				250	xfs_fsize_t n;
				251	xfs_inode_t *ip;
				252	xfs_mount_t *mp;
				253	vnode_t *vp;
				254	unsigned long seg;
				255
				256	ip = XFS_BHVTOI(bdp);
				257	vp = BHV_TO_VNODE(bdp);
				258	mp = ip->i_mount;
				259
				260	XFS_STATS_INC(xs_read_calls);
				261
				262	/* START copy & waste from filemap.c */
				263	for (seg = 0; seg < segs; seg++) {
				264	const struct iovec *iv = &iovp[seg];
				265
				266	/*
				267	* If any segment has a negative length, or the cumulative
				268	* length ever wraps negative then return -EINVAL.
				269	*/
				270	size += iv->iov_len;
				271	if (unlikely((ssize_t)(size\|iv->iov_len) < 0))
				272	return XFS_ERROR(-EINVAL);
				273	}
				274	/* END copy & waste from filemap.c */
				275
				276	if (unlikely(ioflags & IO_ISDIRECT)) {
				277	xfs_buftarg_t *target =
				278	(ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				279	mp->m_rtdev_targp : mp->m_ddev_targp;
				280	if ((*offset & target->pbr_smask) \|\|
				281	(size & target->pbr_smask)) {
				282	if (*offset == ip->i_d.di_size) {
				283	return (0);
				284	}
				285	return -XFS_ERROR(EINVAL);
				286	}
				287	}
				288
				289	n = XFS_MAXIOFFSET(mp) - *offset;
				290	if ((n <= 0) \|\| (size == 0))
				291	return 0;
				292
				293	if (n < size)
				294	size = n;
				295
				296	if (XFS_FORCED_SHUTDOWN(mp)) {
				297	return -EIO;
				298	}
				299
				300	if (unlikely(ioflags & IO_ISDIRECT))
				301	down(&inode->i_sem);
				302	xfs_ilock(ip, XFS_IOLOCK_SHARED);
				303
				304	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
				305	!(ioflags & IO_INVIS)) {
				306	vrwlock_t locktype = VRWLOCK_READ;
				307
				308	ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
				309	BHV_TO_VNODE(bdp), *offset, size,
				310	FILP_DELAY_FLAG(file), &locktype);
				311	if (ret) {
				312	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				313	goto unlock_isem;
				314	}
				315	}
				316
				317	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
				318	(void )iovp, segs, offset, ioflags);
				319	ret = __generic_file_aio_read(iocb, iovp, segs, offset);
				320	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
				321	ret = wait_on_sync_kiocb(iocb);
				322	if (ret > 0)
				323	XFS_STATS_ADD(xs_read_bytes, ret);
				324
				325	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				326
				327	if (likely(!(ioflags & IO_INVIS)))
				328	xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
				329
				330	unlock_isem:
				331	if (unlikely(ioflags & IO_ISDIRECT))
				332	up(&inode->i_sem);
				333	return ret;
				334	}
				335
				336	ssize_t
				337	xfs_sendfile(
				338	bhv_desc_t *bdp,
				339	struct file *filp,
				340	loff_t *offset,
				341	int ioflags,
				342	size_t count,
				343	read_actor_t actor,
				344	void *target,
				345	cred_t *credp)
				346	{
				347	ssize_t ret;
				348	xfs_fsize_t n;
				349	xfs_inode_t *ip;
				350	xfs_mount_t *mp;
				351	vnode_t *vp;
				352
				353	ip = XFS_BHVTOI(bdp);
				354	vp = BHV_TO_VNODE(bdp);
				355	mp = ip->i_mount;
				356
				357	XFS_STATS_INC(xs_read_calls);
				358
				359	n = XFS_MAXIOFFSET(mp) - *offset;
				360	if ((n <= 0) \|\| (count == 0))
				361	return 0;
				362
				363	if (n < count)
				364	count = n;
				365
				366	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
				367	return -EIO;
				368
				369	xfs_ilock(ip, XFS_IOLOCK_SHARED);
				370
				371	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
				372	(!(ioflags & IO_INVIS))) {
				373	vrwlock_t locktype = VRWLOCK_READ;
				374	int error;
				375
				376	error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count,
				377	FILP_DELAY_FLAG(filp), &locktype);
				378	if (error) {
				379	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				380	return -error;
				381	}
				382	}
				383	xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
				384	(void )(unsigned long)target, count, offset, ioflags);
				385	ret = generic_file_sendfile(filp, offset, count, actor, target);
				386
				387	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				388
				389	if (ret > 0)
				390	XFS_STATS_ADD(xs_read_bytes, ret);
				391
				392	if (likely(!(ioflags & IO_INVIS)))
				393	xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
				394
				395	return ret;
				396	}
				397
				398	/*
				399	* This routine is called to handle zeroing any space in the last
				400	* block of the file that is beyond the EOF. We do this since the
				401	* size is being increased without writing anything to that block
				402	* and we don't want anyone to read the garbage on the disk.
				403	*/
				404	STATIC int /* error (positive) */
				405	xfs_zero_last_block(
				406	struct inode *ip,
				407	xfs_iocore_t *io,
				408	xfs_off_t offset,
				409	xfs_fsize_t isize,
				410	xfs_fsize_t end_size)
				411	{
				412	xfs_fileoff_t last_fsb;
				413	xfs_mount_t *mp;
				414	int nimaps;
				415	int zero_offset;
				416	int zero_len;
				417	int isize_fsb_offset;
				418	int error = 0;
				419	xfs_bmbt_irec_t imap;
				420	loff_t loff;
				421	size_t lsize;
				422
				423	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
				424	ASSERT(offset > isize);
				425
				426	mp = io->io_mount;
				427
				428	isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
				429	if (isize_fsb_offset == 0) {
				430	/*
				431	* There are no extra bytes in the last block on disk to
				432	* zero, so return.
				433	*/
				434	return 0;
				435	}
				436
				437	last_fsb = XFS_B_TO_FSBT(mp, isize);
				438	nimaps = 1;
				439	error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
				440	&nimaps, NULL);
				441	if (error) {
				442	return error;
				443	}
				444	ASSERT(nimaps > 0);
				445	/*
				446	* If the block underlying isize is just a hole, then there
				447	* is nothing to zero.
				448	*/
				449	if (imap.br_startblock == HOLESTARTBLOCK) {
				450	return 0;
				451	}
				452	/*
				453	* Zero the part of the last block beyond the EOF, and write it
				454	* out sync. We need to drop the ilock while we do this so we
				455	* don't deadlock when the buffer cache calls back to us.
				456	*/
				457	XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL\| XFS_EXTSIZE_RD);
				458	loff = XFS_FSB_TO_B(mp, last_fsb);
				459	lsize = XFS_FSB_TO_B(mp, 1);
				460
				461	zero_offset = isize_fsb_offset;
				462	zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
				463
				464	error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
				465
				466	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL\|XFS_EXTSIZE_RD);
				467	ASSERT(error >= 0);
				468	return error;
				469	}
				470
				471	/*
				472	* Zero any on disk space between the current EOF and the new,
				473	* larger EOF. This handles the normal case of zeroing the remainder
				474	* of the last block in the file and the unusual case of zeroing blocks
				475	* out beyond the size of the file. This second case only happens
				476	* with fixed size extents and when the system crashes before the inode
				477	* size was updated but after blocks were allocated. If fill is set,
				478	* then any holes in the range are filled and zeroed. If not, the holes
				479	* are left alone as holes.
				480	*/
				481
				482	int /* error (positive) */
				483	xfs_zero_eof(
				484	vnode_t *vp,
				485	xfs_iocore_t *io,
				486	xfs_off_t offset, /* starting I/O offset */
				487	xfs_fsize_t isize, /* current inode size */
				488	xfs_fsize_t end_size) /* terminal inode size */
				489	{
				490	struct inode *ip = LINVFS_GET_IP(vp);
				491	xfs_fileoff_t start_zero_fsb;
				492	xfs_fileoff_t end_zero_fsb;
				493	xfs_fileoff_t prev_zero_fsb;
				494	xfs_fileoff_t zero_count_fsb;
				495	xfs_fileoff_t last_fsb;
				496	xfs_extlen_t buf_len_fsb;
				497	xfs_extlen_t prev_zero_count;
				498	xfs_mount_t *mp;
				499	int nimaps;
				500	int error = 0;
				501	xfs_bmbt_irec_t imap;
				502	loff_t loff;
				503	size_t lsize;
				504
				505	ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
				506	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
				507
				508	mp = io->io_mount;
				509
				510	/*
				511	* First handle zeroing the block on which isize resides.
				512	* We only zero a part of that block so it is handled specially.
				513	*/
				514	error = xfs_zero_last_block(ip, io, offset, isize, end_size);
				515	if (error) {
				516	ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
				517	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
				518	return error;
				519	}
				520
				521	/*
				522	* Calculate the range between the new size and the old
				523	* where blocks needing to be zeroed may exist. To get the
				524	* block where the last byte in the file currently resides,
				525	* we need to subtract one from the size and truncate back
				526	* to a block boundary. We subtract 1 in case the size is
				527	* exactly on a block boundary.
				528	*/
				529	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
				530	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
				531	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
				532	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
				533	if (last_fsb == end_zero_fsb) {
				534	/*
				535	* The size was only incremented on its last block.
				536	* We took care of that above, so just return.
				537	*/
				538	return 0;
				539	}
				540
				541	ASSERT(start_zero_fsb <= end_zero_fsb);
				542	prev_zero_fsb = NULLFILEOFF;
				543	prev_zero_count = 0;
				544	while (start_zero_fsb <= end_zero_fsb) {
				545	nimaps = 1;
				546	zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
				547	error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
				548	0, NULL, 0, &imap, &nimaps, NULL);
				549	if (error) {
				550	ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
				551	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
				552	return error;
				553	}
				554	ASSERT(nimaps > 0);
				555
				556	if (imap.br_state == XFS_EXT_UNWRITTEN \|\|
				557	imap.br_startblock == HOLESTARTBLOCK) {
				558	/*
				559	* This loop handles initializing pages that were
				560	* partially initialized by the code below this
				561	* loop. It basically zeroes the part of the page
				562	* that sits on a hole and sets the page as P_HOLE
				563	* and calls remapf if it is a mapped file.
				564	*/
				565	prev_zero_fsb = NULLFILEOFF;
				566	prev_zero_count = 0;
				567	start_zero_fsb = imap.br_startoff +
				568	imap.br_blockcount;
				569	ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
				570	continue;
				571	}
				572
				573	/*
				574	* There are blocks in the range requested.
				575	* Zero them a single write at a time. We actually
				576	* don't zero the entire range returned if it is
				577	* too big and simply loop around to get the rest.
				578	* That is not the most efficient thing to do, but it
				579	* is simple and this path should not be exercised often.
				580	*/
				581	buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
				582	mp->m_writeio_blocks << 8);
				583	/*
				584	* Drop the inode lock while we're doing the I/O.
				585	* We'll still have the iolock to protect us.
				586	*/
				587	XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL\|XFS_EXTSIZE_RD);
				588
				589	loff = XFS_FSB_TO_B(mp, start_zero_fsb);
				590	lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
				591
				592	error = xfs_iozero(ip, loff, lsize, end_size);
				593
				594	if (error) {
				595	goto out_lock;
				596	}
				597
				598	prev_zero_fsb = start_zero_fsb;
				599	prev_zero_count = buf_len_fsb;
				600	start_zero_fsb = imap.br_startoff + buf_len_fsb;
				601	ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
				602
				603	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL\|XFS_EXTSIZE_RD);
				604	}
				605
				606	return 0;
				607
				608	out_lock:
				609
				610	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL\|XFS_EXTSIZE_RD);
				611	ASSERT(error >= 0);
				612	return error;
				613	}
				614
				615	ssize_t /* bytes written, or (-) error */
				616	xfs_write(
				617	bhv_desc_t *bdp,
				618	struct kiocb *iocb,
				619	const struct iovec *iovp,
				620	unsigned int nsegs,
				621	loff_t *offset,
				622	int ioflags,
				623	cred_t *credp)
				624	{
				625	struct file *file = iocb->ki_filp;
				626	struct address_space *mapping = file->f_mapping;
				627	struct inode *inode = mapping->host;
				628	unsigned long segs = nsegs;
				629	xfs_inode_t *xip;
				630	xfs_mount_t *mp;
				631	ssize_t ret = 0, error = 0;
				632	xfs_fsize_t isize, new_size;
				633	xfs_iocore_t *io;
				634	vnode_t *vp;
				635	unsigned long seg;
				636	int iolock;
				637	int eventsent = 0;
				638	vrwlock_t locktype;
				639	size_t ocount = 0, count;
				640	loff_t pos;
				641	int need_isem = 1, need_flush = 0;
				642
				643	XFS_STATS_INC(xs_write_calls);
				644
				645	vp = BHV_TO_VNODE(bdp);
				646	xip = XFS_BHVTOI(bdp);
				647
				648	for (seg = 0; seg < segs; seg++) {
				649	const struct iovec *iv = &iovp[seg];
				650
				651	/*
				652	* If any segment has a negative length, or the cumulative
				653	* length ever wraps negative then return -EINVAL.
				654	*/
				655	ocount += iv->iov_len;
				656	if (unlikely((ssize_t)(ocount\|iv->iov_len) < 0))
				657	return -EINVAL;
				658	if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
				659	continue;
				660	if (seg == 0)
				661	return -EFAULT;
				662	segs = seg;
				663	ocount -= iv->iov_len; /* This segment is no good */
				664	break;
				665	}
				666
				667	count = ocount;
				668	pos = *offset;
				669
				670	if (count == 0)
				671	return 0;
				672
				673	io = &xip->i_iocore;
				674	mp = io->io_mount;
				675
				676	if (XFS_FORCED_SHUTDOWN(mp))
				677	return -EIO;
				678
				679	fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
				680
				681	if (ioflags & IO_ISDIRECT) {
				682	xfs_buftarg_t *target =
				683	(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
				684	mp->m_rtdev_targp : mp->m_ddev_targp;
				685
				686	if ((pos & target->pbr_smask) \|\| (count & target->pbr_smask))
				687	return XFS_ERROR(-EINVAL);
				688
				689	if (!VN_CACHED(vp) && pos < i_size_read(inode))
				690	need_isem = 0;
				691
				692	if (VN_CACHED(vp))
				693	need_flush = 1;
				694	}
				695
				696	relock:
				697	if (need_isem) {
				698	iolock = XFS_IOLOCK_EXCL;
				699	locktype = VRWLOCK_WRITE;
				700
				701	down(&inode->i_sem);
				702	} else {
				703	iolock = XFS_IOLOCK_SHARED;
				704	locktype = VRWLOCK_WRITE_DIRECT;
				705	}
				706
				707	xfs_ilock(xip, XFS_ILOCK_EXCL\|iolock);
				708
				709	isize = i_size_read(inode);
				710
				711	if (file->f_flags & O_APPEND)
				712	*offset = isize;
				713
				714	start:
				715	error = -generic_write_checks(file, &pos, &count,
				716	S_ISBLK(inode->i_mode));
				717	if (error) {
				718	xfs_iunlock(xip, XFS_ILOCK_EXCL\|iolock);
				719	goto out_unlock_isem;
				720	}
				721
				722	new_size = pos + count;
				723	if (new_size > isize)
				724	io->io_new_size = new_size;
				725
				726	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
				727	!(ioflags & IO_INVIS) && !eventsent)) {
				728	loff_t savedsize = pos;
				729	int dmflags = FILP_DELAY_FLAG(file);
				730
				731	if (need_isem)
				732	dmflags \|= DM_FLAGS_ISEM;
				733
				734	xfs_iunlock(xip, XFS_ILOCK_EXCL);
				735	error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
				736	pos, count,
				737	dmflags, &locktype);
				738	if (error) {
				739	xfs_iunlock(xip, iolock);
				740	goto out_unlock_isem;
				741	}
				742	xfs_ilock(xip, XFS_ILOCK_EXCL);
				743	eventsent = 1;
				744
				745	/*
				746	* The iolock was dropped and reaquired in XFS_SEND_DATA
				747	* so we have to recheck the size when appending.
				748	* We will only "goto start;" once, since having sent the
				749	* event prevents another call to XFS_SEND_DATA, which is
				750	* what allows the size to change in the first place.
				751	*/
				752	if ((file->f_flags & O_APPEND) && savedsize != isize) {
				753	pos = isize = xip->i_d.di_size;
				754	goto start;
				755	}
				756	}
				757
				758	/*
				759	* On Linux, generic_file_write updates the times even if
				760	* no data is copied in so long as the write had a size.
				761	*
				762	* We must update xfs' times since revalidate will overcopy xfs.
				763	*/
				764	if (!(ioflags & IO_INVIS)) {
				765	xfs_ichgtime(xip, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
				766	inode_update_time(inode, 1);
				767	}
				768
				769	/*
				770	* If the offset is beyond the size of the file, we have a couple
				771	* of things to do. First, if there is already space allocated
				772	* we need to either create holes or zero the disk or ...
				773	*
				774	* If there is a page where the previous size lands, we need
				775	* to zero it out up to the new size.
				776	*/
				777
				778	if (pos > isize) {
				779	error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
				780	isize, pos + count);
				781	if (error) {
				782	xfs_iunlock(xip, XFS_ILOCK_EXCL\|iolock);
				783	goto out_unlock_isem;
				784	}
				785	}
				786	xfs_iunlock(xip, XFS_ILOCK_EXCL);
				787
				788	/*
				789	* If we're writing the file then make sure to clear the
				790	* setuid and setgid bits if the process is not being run
				791	* by root. This keeps people from modifying setuid and
				792	* setgid binaries.
				793	*/
				794
				795	if (((xip->i_d.di_mode & S_ISUID) \|\|
				796	((xip->i_d.di_mode & (S_ISGID \| S_IXGRP)) ==
				797	(S_ISGID \| S_IXGRP))) &&
				798	!capable(CAP_FSETID)) {
				799	error = xfs_write_clear_setuid(xip);
				800	if (likely(!error))
				801	error = -remove_suid(file->f_dentry);
				802	if (unlikely(error)) {
				803	xfs_iunlock(xip, iolock);
				804	goto out_unlock_isem;
				805	}
				806	}
				807
				808	retry:
				809	/* We can write back this queue in page reclaim */
				810	current->backing_dev_info = mapping->backing_dev_info;
				811
				812	if ((ioflags & IO_ISDIRECT)) {
				813	if (need_flush) {
				814	xfs_inval_cached_trace(io, pos, -1,
				815	ctooff(offtoct(pos)), -1);
				816	VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
				817	-1, FI_REMAPF_LOCKED);
				818	}
				819
				820	if (need_isem) {
				821	/* demote the lock now the cached pages are gone */
				822	XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
				823	up(&inode->i_sem);
				824
				825	iolock = XFS_IOLOCK_SHARED;
				826	locktype = VRWLOCK_WRITE_DIRECT;
				827	need_isem = 0;
				828	}
				829
				830	xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
				831	*offset, ioflags);
				832	ret = generic_file_direct_write(iocb, iovp,
				833	&segs, pos, offset, count, ocount);
				834
				835	/*
				836	* direct-io write to a hole: fall through to buffered I/O
				837	* for completing the rest of the request.
				838	*/
				839	if (ret >= 0 && ret != count) {
				840	XFS_STATS_ADD(xs_write_bytes, ret);
				841
				842	pos += ret;
				843	count -= ret;
				844
				845	need_isem = 1;
				846	ioflags &= ~IO_ISDIRECT;
				847	xfs_iunlock(xip, iolock);
				848	goto relock;
				849	}
				850	} else {
				851	xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
				852	*offset, ioflags);
				853	ret = generic_file_buffered_write(iocb, iovp, segs,
				854	pos, offset, count, ret);
				855	}
				856
				857	current->backing_dev_info = NULL;
				858
				859	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
				860	ret = wait_on_sync_kiocb(iocb);
				861
				862	if ((ret == -ENOSPC) &&
				863	DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
				864	!(ioflags & IO_INVIS)) {
				865
				866	xfs_rwunlock(bdp, locktype);
				867	error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
				868	DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
				869	0, 0, 0); /* Delay flag intentionally unused */
				870	if (error)
				871	goto out_unlock_isem;
				872	xfs_rwlock(bdp, locktype);
				873	pos = xip->i_d.di_size;
				874	ret = 0;
				875	goto retry;
				876	}
				877
				878	if (*offset > xip->i_d.di_size) {
				879	xfs_ilock(xip, XFS_ILOCK_EXCL);
				880	if (*offset > xip->i_d.di_size) {
				881	xip->i_d.di_size = *offset;
				882	i_size_write(inode, *offset);
				883	xip->i_update_core = 1;
				884	xip->i_update_size = 1;
				885	}
				886	xfs_iunlock(xip, XFS_ILOCK_EXCL);
				887	}
				888
				889	error = -ret;
				890	if (ret <= 0)
				891	goto out_unlock_internal;
				892
				893	XFS_STATS_ADD(xs_write_bytes, ret);
				894
				895	/* Handle various SYNC-type writes */
				896	if ((file->f_flags & O_SYNC) \|\| IS_SYNC(inode)) {
				897	/*
				898	* If we're treating this as O_DSYNC and we have not updated the
				899	* size, force the log.
				900	*/
				901	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
				902	!(xip->i_update_size)) {
				903	xfs_inode_log_item_t *iip = xip->i_itemp;
				904
				905	/*
				906	* If an allocation transaction occurred
				907	* without extending the size, then we have to force
				908	* the log up the proper point to ensure that the
				909	* allocation is permanent. We can't count on
				910	* the fact that buffered writes lock out direct I/O
				911	* writes - the direct I/O write could have extended
				912	* the size nontransactionally, then finished before
				913	* we started. xfs_write_file will think that the file
				914	* didn't grow but the update isn't safe unless the
				915	* size change is logged.
				916	*
				917	* Force the log if we've committed a transaction
				918	* against the inode or if someone else has and
				919	* the commit record hasn't gone to disk (e.g.
				920	* the inode is pinned). This guarantees that
				921	* all changes affecting the inode are permanent
				922	* when we return.
				923	*/
				924	if (iip && iip->ili_last_lsn) {
				925	xfs_log_force(mp, iip->ili_last_lsn,
				926	XFS_LOG_FORCE \| XFS_LOG_SYNC);
				927	} else if (xfs_ipincount(xip) > 0) {
				928	xfs_log_force(mp, (xfs_lsn_t)0,
				929	XFS_LOG_FORCE \| XFS_LOG_SYNC);
				930	}
				931
				932	} else {
				933	xfs_trans_t *tp;
				934
				935	/*
				936	* O_SYNC or O_DSYNC _with_ a size update are handled
				937	* the same way.
				938	*
				939	* If the write was synchronous then we need to make
				940	* sure that the inode modification time is permanent.
				941	* We'll have updated the timestamp above, so here
				942	* we use a synchronous transaction to log the inode.
				943	* It's not fast, but it's necessary.
				944	*
				945	* If this a dsync write and the size got changed
				946	* non-transactionally, then we need to ensure that
				947	* the size change gets logged in a synchronous
				948	* transaction.
				949	*/
				950
				951	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
				952	if ((error = xfs_trans_reserve(tp, 0,
				953	XFS_SWRITE_LOG_RES(mp),
				954	0, 0, 0))) {
				955	/* Transaction reserve failed */
				956	xfs_trans_cancel(tp, 0);
				957	} else {
				958	/* Transaction reserve successful */
				959	xfs_ilock(xip, XFS_ILOCK_EXCL);
				960	xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
				961	xfs_trans_ihold(tp, xip);
				962	xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
				963	xfs_trans_set_sync(tp);
				964	error = xfs_trans_commit(tp, 0, NULL);
				965	xfs_iunlock(xip, XFS_ILOCK_EXCL);
				966	}
				967	if (error)
				968	goto out_unlock_internal;
				969	}
				970
				971	xfs_rwunlock(bdp, locktype);
				972	if (need_isem)
				973	up(&inode->i_sem);
				974
				975	error = sync_page_range(inode, mapping, pos, ret);
				976	if (!error)
				977	error = ret;
				978	return error;
				979	}
				980
				981	out_unlock_internal:
				982	xfs_rwunlock(bdp, locktype);
				983	out_unlock_isem:
				984	if (need_isem)
				985	up(&inode->i_sem);
				986	return -error;
				987	}
				988
				989	/*
				990	* All xfs metadata buffers except log state machine buffers
				991	* get this attached as their b_bdstrat callback function.
				992	* This is so that we can catch a buffer
				993	* after prematurely unpinning it to forcibly shutdown the filesystem.
				994	*/
				995	int
				996	xfs_bdstrat_cb(struct xfs_buf *bp)
				997	{
				998	xfs_mount_t *mp;
				999
				1000	mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
				1001	if (!XFS_FORCED_SHUTDOWN(mp)) {
				1002	pagebuf_iorequest(bp);
				1003	return 0;
				1004	} else {
				1005	xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
				1006	/*
				1007	* Metadata write that didn't get logged but
				1008	* written delayed anyway. These aren't associated
				1009	* with a transaction, and can be ignored.
				1010	*/
				1011	if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
				1012	(XFS_BUF_ISREAD(bp)) == 0)
				1013	return (xfs_bioerror_relse(bp));
				1014	else
				1015	return (xfs_bioerror(bp));
				1016	}
				1017	}
				1018
				1019
				1020	int
				1021	xfs_bmap(bhv_desc_t *bdp,
				1022	xfs_off_t offset,
				1023	ssize_t count,
				1024	int flags,
				1025	xfs_iomap_t *iomapp,
				1026	int *niomaps)
				1027	{
				1028	xfs_inode_t *ip = XFS_BHVTOI(bdp);
				1029	xfs_iocore_t *io = &ip->i_iocore;
				1030
				1031	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
				1032	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
				1033	((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
				1034
				1035	return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
				1036	}
				1037
				/*
				 * Wrapper around bdstrat so that we can stop data
				 * from going to disk in case we are shutting down the filesystem.
				 * Typically user data goes thru this path; one of the exceptions
				 * is the superblock.
				 *
				 * Returns 0 after submitting the buffer, or the result of
				 * xfs_bioerror_relse() when the filesystem has been shut down.
				 */
				int
				xfsbdstrat(
					struct xfs_mount	*mp,
					struct xfs_buf		*bp)
				{
					ASSERT(mp);

					if (XFS_FORCED_SHUTDOWN(mp)) {
						xfs_buftrace("XFSBDSTRAT IOERROR", bp);
						return (xfs_bioerror_relse(bp));
					}

					/* Grio redirection would go here
					 * if (XFS_BUF_IS_GRIO(bp)) {
					 */

					pagebuf_iorequest(bp);
					return 0;
				}
				1062
				1063	/*
				1064	* If the underlying (data/log/rt) device is readonly, there are some
				1065	* operations that cannot proceed.
				1066	*/
				1067	int
				1068	xfs_dev_is_read_only(
				1069	xfs_mount_t *mp,
				1070	char *message)
				1071	{
				1072	if (xfs_readonly_buftarg(mp->m_ddev_targp) \|\|
				1073	xfs_readonly_buftarg(mp->m_logdev_targp) \|\|
				1074	(mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
				1075	cmn_err(CE_NOTE,
				1076	"XFS: %s required on read-only device.", message);
				1077	cmn_err(CE_NOTE,
				1078	"XFS: write access unavailable, cannot proceed.");
				1079	return EROFS;
				1080	}
				1081	return 0;
				1082	}