Blame - fs/buffer.c - kernel/msm

blob: ac84cd13075d60142002897661ba067d60b65239 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	27	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	28	#include <linux/blkdev.h>
				29	#include <linux/file.h>
				30	#include <linux/quotaops.h>
				31	#include <linux/highmem.h>
				32	#include <linux/module.h>
				33	#include <linux/writeback.h>
				34	#include <linux/hash.h>
				35	#include <linux/suspend.h>
				36	#include <linux/buffer_head.h>
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	37	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	38	#include <linux/bio.h>
				39	#include <linux/notifier.h>
				40	#include <linux/cpu.h>
				41	#include <linux/bitops.h>
				42	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	43	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	44
				45	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	46
				47	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				48
				49	inline void
				50	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				51	{
				52	bh->b_end_io = handler;
				53	bh->b_private = private;
				54	}
				55
				56	static int sync_buffer(void *word)
				57	{
				58	struct block_device *bd;
				59	struct buffer_head *bh
				60	= container_of(word, struct buffer_head, b_state);
				61
				62	smp_mb();
				63	bd = bh->b_bdev;
				64	if (bd)
				65	blk_run_address_space(bd->bd_inode->i_mapping);
				66	io_schedule();
				67	return 0;
				68	}
				69
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	70	void __lock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	71	{
				72	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				73	TASK_UNINTERRUPTIBLE);
				74	}
				75	EXPORT_SYMBOL(__lock_buffer);
				76
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	77	void unlock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	78	{
Nick Piggin	72ed3d0	2007-02-10 01:46:22 -0800	[diff] [blame]	79	smp_mb__before_clear_bit();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	80	clear_buffer_locked(bh);
				81	smp_mb__after_clear_bit();
				82	wake_up_bit(&bh->b_state, BH_Lock);
				83	}
				84
				85	/*
				86	* Block until a buffer comes unlocked. This doesn't stop it
				87	* from becoming locked again - you have to lock it yourself
				88	* if you want to preserve its state.
				89	*/
				90	void __wait_on_buffer(struct buffer_head * bh)
				91	{
				92	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				93	}
				94
				95	static void
				96	__clear_page_buffers(struct page *page)
				97	{
				98	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	99	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	100	page_cache_release(page);
				101	}
				102
				103	static void buffer_io_error(struct buffer_head *bh)
				104	{
				105	char b[BDEVNAME_SIZE];
				106
				107	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				108	bdevname(bh->b_bdev, b),
				109	(unsigned long long)bh->b_blocknr);
				110	}
				111
				112	/*
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	113	* End-of-IO handler helper function which does not touch the bh after
				114	* unlocking it.
				115	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				116	* a race there is benign: unlock_buffer() only use the bh's address for
				117	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				118	* itself.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	119	*/
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	120	static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	121	{
				122	if (uptodate) {
				123	set_buffer_uptodate(bh);
				124	} else {
				125	/* This happens, due to failed READA attempts. */
				126	clear_buffer_uptodate(bh);
				127	}
				128	unlock_buffer(bh);
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	129	}
				130
				131	/*
				132	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				133	* unlock the buffer. This is what ll_rw_block uses too.
				134	*/
				135	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				136	{
				137	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	138	put_bh(bh);
				139	}
				140
				141	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				142	{
				143	char b[BDEVNAME_SIZE];
				144
				145	if (uptodate) {
				146	set_buffer_uptodate(bh);
				147	} else {
				148	if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
				149	buffer_io_error(bh);
				150	printk(KERN_WARNING "lost page write due to "
				151	"I/O error on %s\n",
				152	bdevname(bh->b_bdev, b));
				153	}
				154	set_buffer_write_io_error(bh);
				155	clear_buffer_uptodate(bh);
				156	}
				157	unlock_buffer(bh);
				158	put_bh(bh);
				159	}
				160
				161	/*
				162	* Write out and wait upon all the dirty data associated with a block
				163	* device via its mapping. Does not take the superblock lock.
				164	*/
				165	int sync_blockdev(struct block_device *bdev)
				166	{
				167	int ret = 0;
				168
OGAWA Hirofumi	28fd129	2006-01-08 01:02:14 -0800	[diff] [blame]	169	if (bdev)
				170	ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	171	return ret;
				172	}
				173	EXPORT_SYMBOL(sync_blockdev);
				174
/*
 * Write out and wait upon all dirty data associated with this device:
 * filesystem data as well as the underlying block device.  Takes the
 * superblock lock.
 *
 * If get_super() finds a superblock for @bdev, the result of
 * fsync_super() is returned; otherwise this falls back to
 * sync_blockdev().
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int err;

	if (!sb)
		return sync_blockdev(bdev);

	err = fsync_super(sb);
	drop_super(sb);
	return err;
}
				190
				191	/**
				192	* freeze_bdev -- lock a filesystem and force it into a consistent state
				193	* @bdev: blockdevice to lock
				194	*
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	195	* This takes the block device bd_mount_sem to make sure no new mounts
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	196	* happen on bdev until thaw_bdev() is called.
				197	* If a superblock is found on this device, we take the s_umount semaphore
				198	* on it to make sure nobody unmounts until the snapshot creation is done.
				199	*/
				200	struct super_block freeze_bdev(struct block_device bdev)
				201	{
				202	struct super_block *sb;
				203
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	204	down(&bdev->bd_mount_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	205	sb = get_super(bdev);
				206	if (sb && !(sb->s_flags & MS_RDONLY)) {
				207	sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	208	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	209
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	210	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	211
				212	sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	213	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	214
				215	sync_blockdev(sb->s_bdev);
				216
				217	if (sb->s_op->write_super_lockfs)
				218	sb->s_op->write_super_lockfs(sb);
				219	}
				220
				221	sync_blockdev(bdev);
				222	return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
				223	}
				224	EXPORT_SYMBOL(freeze_bdev);
				225
				226	/**
				227	* thaw_bdev -- unlock filesystem
				228	* @bdev: blockdevice to unlock
				229	* @sb: associated superblock
				230	*
				231	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
				232	*/
				233	void thaw_bdev(struct block_device bdev, struct super_block sb)
				234	{
				235	if (sb) {
				236	BUG_ON(sb->s_bdev != bdev);
				237
				238	if (sb->s_op->unlockfs)
				239	sb->s_op->unlockfs(sb);
				240	sb->s_frozen = SB_UNFROZEN;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	241	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	242	wake_up(&sb->s_wait_unfrozen);
				243	drop_super(sb);
				244	}
				245
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	246	up(&bdev->bd_mount_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	247	}
				248	EXPORT_SYMBOL(thaw_bdev);
				249
				250	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	251	* Various filesystems appear to want __find_get_block to be non-blocking.
				252	* But it's the page lock which protects the buffers. To get around this,
				253	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				254	* private_lock.
				255	*
				256	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				257	* may be quite high. This code could TryLock the page, and if that
				258	* succeeds, there is no need to take private_lock. (But if
				259	* private_lock is contended then so is mapping->tree_lock).
				260	*/
				261	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	262	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	263	{
				264	struct inode *bd_inode = bdev->bd_inode;
				265	struct address_space *bd_mapping = bd_inode->i_mapping;
				266	struct buffer_head *ret = NULL;
				267	pgoff_t index;
				268	struct buffer_head *bh;
				269	struct buffer_head *head;
				270	struct page *page;
				271	int all_mapped = 1;
				272
				273	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				274	page = find_get_page(bd_mapping, index);
				275	if (!page)
				276	goto out;
				277
				278	spin_lock(&bd_mapping->private_lock);
				279	if (!page_has_buffers(page))
				280	goto out_unlock;
				281	head = page_buffers(page);
				282	bh = head;
				283	do {
				284	if (bh->b_blocknr == block) {
				285	ret = bh;
				286	get_bh(bh);
				287	goto out_unlock;
				288	}
				289	if (!buffer_mapped(bh))
				290	all_mapped = 0;
				291	bh = bh->b_this_page;
				292	} while (bh != head);
				293
				294	/* we might be here because some of the buffers on this page are
				295	* not mapped. This is due to various races between
				296	* file io on the block device and getblk. It gets dealt with
				297	* elsewhere, don't buffer_error if we had some unmapped buffers
				298	*/
				299	if (all_mapped) {
				300	printk("__find_get_block_slow() failed. "
				301	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	302	(unsigned long long)block,
				303	(unsigned long long)bh->b_blocknr);
				304	printk("b_state=0x%08lx, b_size=%zu\n",
				305	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	306	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				307	}
				308	out_unlock:
				309	spin_unlock(&bd_mapping->private_lock);
				310	page_cache_release(page);
				311	out:
				312	return ret;
				313	}
				314
				315	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				316	of fs corruption is going on. Trashing dirty data always imply losing
				317	information that was supposed to be just stored on the physical layer
				318	by the user.
				319
				320	Thus invalidate_buffers in general usage is not allowed to trash
				321	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				322	be preserved. These buffers are simply skipped.
				323
				324	We also skip buffers which are still in use. For example this can
				325	happen if a userspace program is reading the block device.
				326
				327	NOTE: In the case where the user removed a removable-media-disk even if
				328	there's still dirty data not synced on disk (due a bug in the device driver
				329	or due an error of the user), by not destroying the dirty buffers we could
				330	generate corruption also on the next media inserted, thus a parameter is
				331	necessary to handle this case in the most safe way possible (trying
				332	to not corrupt also the new disk inserted with the data belonging to
				333	the old now corrupted disk). Also for the ramdisk the natural thing
				334	to do in order to release the ramdisk memory is to destroy dirty buffers.
				335
				336	These are two special cases. Normal usage imply the device driver
				337	to issue a sync on the device (without waiting I/O completion) and
				338	then an invalidate_buffers call that doesn't trash dirty buffers.
				339
				340	For handling cache coherency with the blkdev pagecache the 'update' case
				341	has been introduced. It is needed to re-read from disk any pinned
				342	buffer. NOTE: re-reading from disk is destructive so we can do it only
				343	when we assume nobody is changing the buffercache under our I/O and when
				344	we think the disk contains more recent information than the buffercache.
				345	The update == 1 pass marks the buffers we need to update, the update == 2
				346	pass does the actual I/O. */
Peter Zijlstra	f98393a	2007-05-06 14:49:54 -0700	[diff] [blame]	347	void invalidate_bdev(struct block_device *bdev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	348	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	349	struct address_space *mapping = bdev->bd_inode->i_mapping;
				350
				351	if (mapping->nrpages == 0)
				352	return;
				353
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	354	invalidate_bh_lrus();
Andrew Morton	fc0ecff	2007-02-10 01:45:39 -0800	[diff] [blame]	355	invalidate_mapping_pages(mapping, 0, -1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	356	}
				357
				358	/*
				359	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				360	*/
				361	static void free_more_memory(void)
				362	{
Mel Gorman	dd1a239	2008-04-28 02:12:17 -0700	[diff] [blame^]	363	struct zoneref *zrefs;
Mel Gorman	0e88460	2008-04-28 02:12:14 -0700	[diff] [blame]	364	int nid;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	365
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	366	wakeup_pdflush(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	367	yield();
				368
Mel Gorman	0e88460	2008-04-28 02:12:14 -0700	[diff] [blame]	369	for_each_online_node(nid) {
Mel Gorman	dd1a239	2008-04-28 02:12:17 -0700	[diff] [blame^]	370	zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
Mel Gorman	54a6eb5	2008-04-28 02:12:16 -0700	[diff] [blame]	371	gfp_zone(GFP_NOFS));
Mel Gorman	dd1a239	2008-04-28 02:12:17 -0700	[diff] [blame^]	372	if (zrefs->zone)
Mel Gorman	54a6eb5	2008-04-28 02:12:16 -0700	[diff] [blame]	373	try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
				374	GFP_NOFS);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	375	}
				376	}
				377
				378	/*
				379	* I/O completion handler for block_read_full_page() - pages
				380	* which come unlocked at the end of I/O.
				381	*/
				382	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				383	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	384	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	385	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	386	struct buffer_head *tmp;
				387	struct page *page;
				388	int page_uptodate = 1;
				389
				390	BUG_ON(!buffer_async_read(bh));
				391
				392	page = bh->b_page;
				393	if (uptodate) {
				394	set_buffer_uptodate(bh);
				395	} else {
				396	clear_buffer_uptodate(bh);
				397	if (printk_ratelimit())
				398	buffer_io_error(bh);
				399	SetPageError(page);
				400	}
				401
				402	/*
				403	* Be _very_ careful from here on. Bad things can happen if
				404	* two buffer heads end IO at almost the same time and both
				405	* decide that the page is now completely done.
				406	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	407	first = page_buffers(page);
				408	local_irq_save(flags);
				409	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	410	clear_buffer_async_read(bh);
				411	unlock_buffer(bh);
				412	tmp = bh;
				413	do {
				414	if (!buffer_uptodate(tmp))
				415	page_uptodate = 0;
				416	if (buffer_async_read(tmp)) {
				417	BUG_ON(!buffer_locked(tmp));
				418	goto still_busy;
				419	}
				420	tmp = tmp->b_this_page;
				421	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	422	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				423	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	424
				425	/*
				426	* If none of the buffers had errors and they are all
				427	* uptodate then we can set the page uptodate.
				428	*/
				429	if (page_uptodate && !PageError(page))
				430	SetPageUptodate(page);
				431	unlock_page(page);
				432	return;
				433
				434	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	435	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				436	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	437	return;
				438	}
				439
				440	/*
				441	* Completion handler for block_write_full_page() - pages which are unlocked
				442	* during I/O, and which have PageWriteback cleared upon I/O completion.
				443	*/
Adrian Bunk	b6cd0b7	2006-06-27 02:53:54 -0700	[diff] [blame]	444	static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	445	{
				446	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	447	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	448	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	449	struct buffer_head *tmp;
				450	struct page *page;
				451
				452	BUG_ON(!buffer_async_write(bh));
				453
				454	page = bh->b_page;
				455	if (uptodate) {
				456	set_buffer_uptodate(bh);
				457	} else {
				458	if (printk_ratelimit()) {
				459	buffer_io_error(bh);
				460	printk(KERN_WARNING "lost page write due to "
				461	"I/O error on %s\n",
				462	bdevname(bh->b_bdev, b));
				463	}
				464	set_bit(AS_EIO, &page->mapping->flags);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	465	set_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	466	clear_buffer_uptodate(bh);
				467	SetPageError(page);
				468	}
				469
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	470	first = page_buffers(page);
				471	local_irq_save(flags);
				472	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				473
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	474	clear_buffer_async_write(bh);
				475	unlock_buffer(bh);
				476	tmp = bh->b_this_page;
				477	while (tmp != bh) {
				478	if (buffer_async_write(tmp)) {
				479	BUG_ON(!buffer_locked(tmp));
				480	goto still_busy;
				481	}
				482	tmp = tmp->b_this_page;
				483	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	484	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				485	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	486	end_page_writeback(page);
				487	return;
				488
				489	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	490	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				491	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	492	return;
				493	}
				494
				495	/*
				496	* If a page's buffers are under async readin (end_buffer_async_read
				497	* completion) then there is a possibility that another thread of
				498	* control could lock one of the buffers after it has completed
				499	* but while some of the other buffers have not completed. This
				500	* locked buffer would confuse end_buffer_async_read() into not unlocking
				501	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				502	* that this buffer is not under async I/O.
				503	*
				504	* The page comes unlocked when it has no locked buffer_async buffers
				505	* left.
				506	*
				507	* PageLocked prevents anyone starting new async I/O reads any of
				508	* the buffers.
				509	*
				510	* PageWriteback is used to prevent simultaneous writeout of the same
				511	* page.
				512	*
				513	* PageLocked prevents anyone from starting writeback of a page which is
				514	* under read I/O (PageWriteback is only ever set against a locked page).
				515	*/
				516	static void mark_buffer_async_read(struct buffer_head *bh)
				517	{
				518	bh->b_end_io = end_buffer_async_read;
				519	set_buffer_async_read(bh);
				520	}
				521
				522	void mark_buffer_async_write(struct buffer_head *bh)
				523	{
				524	bh->b_end_io = end_buffer_async_write;
				525	set_buffer_async_write(bh);
				526	}
				527	EXPORT_SYMBOL(mark_buffer_async_write);
				528
				529
				530	/*
				531	* fs/buffer.c contains helper functions for buffer-backed address space's
				532	* fsync functions. A common requirement for buffer-based filesystems is
				533	* that certain data from the backing blockdev needs to be written out for
				534	* a successful fsync(). For example, ext2 indirect blocks need to be
				535	* written back and waited upon before fsync() returns.
				536	*
				537	* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
				538	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				539	* management of a list of dependent buffers at ->i_mapping->private_list.
				540	*
				541	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				542	* from their controlling inode's queue when they are being freed. But
				543	* try_to_free_buffers() will be operating against the blockdev mapping
				544	* at the time, not against the S_ISREG file which depends on those buffers.
				545	* So the locking for private_list is via the private_lock in the address_space
				546	* which backs the buffers. Which is different from the address_space
				547	* against which the buffers are listed. So for a particular address_space,
				548	* mapping->private_lock does not protect mapping->private_list! In fact,
				549	* mapping->private_list will always be protected by the backing blockdev's
				550	* ->private_lock.
				551	*
				552	* Which introduces a requirement: all buffers on an address_space's
				553	* ->private_list must be from the same address_space: the blockdev's.
				554	*
				555	* address_spaces which do not place buffers at ->private_list via these
				556	* utility functions are free to use private_lock and private_list for
				557	* whatever they want. The only requirement is that list_empty(private_list)
				558	* be true at clear_inode() time.
				559	*
				560	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				561	* filesystems should do that. invalidate_inode_buffers() should just go
				562	* BUG_ON(!list_empty).
				563	*
				564	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				565	* take an address_space, not an inode. And it should be called
				566	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				567	* queued up.
				568	*
				569	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				570	* list if it is already on a list. Because if the buffer is on a list,
				571	* it must already be on the right one. If not, the filesystem is being
				572	* silly. This will save a ton of locking. But first we have to ensure
				573	* that buffers are taken off the old inode's list when they are freed
				574	* (presumably in truncate). That requires careful auditing of all
				575	* filesystems (do it inside bforget()). It could also be done by bringing
				576	* b_inode back.
				577	*/
				578
				579	/*
				580	* The buffer's backing address_space's private_lock must be held
				581	*/
				582	static inline void __remove_assoc_queue(struct buffer_head *bh)
				583	{
				584	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	585	WARN_ON(!bh->b_assoc_map);
				586	if (buffer_write_io_error(bh))
				587	set_bit(AS_EIO, &bh->b_assoc_map->flags);
				588	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	589	}
				590
				591	int inode_has_buffers(struct inode *inode)
				592	{
				593	return !list_empty(&inode->i_data.private_list);
				594	}
				595
				596	/*
				597	* osync is designed to support O_SYNC io. It waits synchronously for
				598	* all already-submitted IO to complete, but does not queue any new
				599	* writes to the disk.
				600	*
				601	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				602	* you dirty the buffers, and then use osync_inode_buffers to wait for
				603	* completion. Any other dirty buffers which are not yet queued for
				604	* write will not be flushed to disk by the osync.
				605	*/
				606	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				607	{
				608	struct buffer_head *bh;
				609	struct list_head *p;
				610	int err = 0;
				611
				612	spin_lock(lock);
				613	repeat:
				614	list_for_each_prev(p, list) {
				615	bh = BH_ENTRY(p);
				616	if (buffer_locked(bh)) {
				617	get_bh(bh);
				618	spin_unlock(lock);
				619	wait_on_buffer(bh);
				620	if (!buffer_uptodate(bh))
				621	err = -EIO;
				622	brelse(bh);
				623	spin_lock(lock);
				624	goto repeat;
				625	}
				626	}
				627	spin_unlock(lock);
				628	return err;
				629	}
				630
				631	/**
Randy Dunlap	78a4a50	2008-02-29 22:02:31 -0800	[diff] [blame]	632	* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	633	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	634	*
				635	* Starts I/O against the buffers at mapping->private_list, and waits upon
				636	* that I/O.
				637	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	638	* Basically, this is a convenience function for fsync().
				639	* @mapping is a file or directory which needs those buffers to be written for
				640	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	641	*/
				642	int sync_mapping_buffers(struct address_space *mapping)
				643	{
				644	struct address_space *buffer_mapping = mapping->assoc_mapping;
				645
				646	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				647	return 0;
				648
				649	return fsync_buffers_list(&buffer_mapping->private_lock,
				650	&mapping->private_list);
				651	}
				652	EXPORT_SYMBOL(sync_mapping_buffers);
				653
				654	/*
				655	* Called when we've recently written block `bblock', and it is known that
				656	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				657	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				658	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				659	*/
				660	void write_boundary_block(struct block_device *bdev,
				661	sector_t bblock, unsigned blocksize)
				662	{
				663	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				664	if (bh) {
				665	if (buffer_dirty(bh))
				666	ll_rw_block(WRITE, 1, &bh);
				667	put_bh(bh);
				668	}
				669	}
				670
				671	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				672	{
				673	struct address_space *mapping = inode->i_mapping;
				674	struct address_space *buffer_mapping = bh->b_page->mapping;
				675
				676	mark_buffer_dirty(bh);
				677	if (!mapping->assoc_mapping) {
				678	mapping->assoc_mapping = buffer_mapping;
				679	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	680	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	681	}
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	682	if (!bh->b_assoc_map) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	683	spin_lock(&buffer_mapping->private_lock);
				684	list_move_tail(&bh->b_assoc_buffers,
				685	&mapping->private_list);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	686	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	687	spin_unlock(&buffer_mapping->private_lock);
				688	}
				689	}
				690	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				691
				692	/*
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	693	* Mark the page dirty, and set it dirty in the radix tree, and mark the inode
				694	* dirty.
				695	*
				696	* If warn is true, then emit a warning if the page is not uptodate and has
				697	* not been truncated.
				698	*/
				699	static int __set_page_dirty(struct page *page,
				700	struct address_space *mapping, int warn)
				701	{
				702	if (unlikely(!mapping))
				703	return !TestSetPageDirty(page);
				704
				705	if (TestSetPageDirty(page))
				706	return 0;
				707
				708	write_lock_irq(&mapping->tree_lock);
				709	if (page->mapping) { /* Race with truncate? */
				710	WARN_ON_ONCE(warn && !PageUptodate(page));
				711
				712	if (mapping_cap_account_dirty(mapping)) {
				713	__inc_zone_page_state(page, NR_FILE_DIRTY);
Peter Zijlstra	c9e51e4	2007-10-16 23:25:47 -0700	[diff] [blame]	714	__inc_bdi_stat(mapping->backing_dev_info,
				715	BDI_RECLAIMABLE);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	716	task_io_account_write(PAGE_CACHE_SIZE);
				717	}
				718	radix_tree_tag_set(&mapping->page_tree,
				719	page_index(page), PAGECACHE_TAG_DIRTY);
				720	}
				721	write_unlock_irq(&mapping->tree_lock);
				722	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
				723
				724	return 1;
				725	}
				726
				727	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	728	* Add a page to the dirty page list.
				729	*
				730	* It is a sad fact of life that this function is called from several places
				731	* deeply under spinlocking. It may not sleep.
				732	*
				733	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				734	* dirty-state coherency between the page and the buffers. It the page does
				735	* not have buffers then when they are later attached they will all be set
				736	* dirty.
				737	*
				738	* The buffers are dirtied before the page is dirtied. There's a small race
				739	* window in which a writepage caller may see the page cleanness but not the
				740	* buffer dirtiness. That's fine. If this code were to set the page dirty
				741	* before the buffers, a concurrent writepage caller could clear the page dirty
				742	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				743	* page on the dirty page list.
				744	*
				745	* We use private_lock to lock against try_to_free_buffers while using the
				746	* page's buffer list. Also use this to protect against clean buffers being
				747	* added to the page after it was set dirty.
				748	*
				749	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				750	* address_space though.
				751	*/
				752	int __set_page_dirty_buffers(struct page *page)
				753	{
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	754	struct address_space *mapping = page_mapping(page);
Nick Piggin	ebf7a22	2006-10-10 04:36:54 +0200	[diff] [blame]	755
				756	if (unlikely(!mapping))
				757	return !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	758
				759	spin_lock(&mapping->private_lock);
				760	if (page_has_buffers(page)) {
				761	struct buffer_head *head = page_buffers(page);
				762	struct buffer_head *bh = head;
				763
				764	do {
				765	set_buffer_dirty(bh);
				766	bh = bh->b_this_page;
				767	} while (bh != head);
				768	}
				769	spin_unlock(&mapping->private_lock);
				770
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	771	return __set_page_dirty(page, mapping, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	772	}
				773	EXPORT_SYMBOL(__set_page_dirty_buffers);
				774
				775	/*
				776	* Write out and wait upon a list of buffers.
				777	*
				778	* We have conflicting pressures: we want to make sure that all
				779	* initially dirty buffers get waited on, but that any subsequently
				780	* dirtied buffers don't. After all, we don't want fsync to last
				781	* forever if somebody is actively writing to the file.
				782	*
				783	* Do this in two main stages: first we copy dirty buffers to a
				784	* temporary inode list, queueing the writes as we go. Then we clean
				785	* up, waiting for those writes to complete.
				786	*
				787	* During this second stage, any subsequent updates to the file may end
				788	* up refiling the buffer on the original inode's dirty list again, so
				789	* there is a chance we will end up with a buffer queued for write but
				790	* not yet completed on that list. So, as a final cleanup we go through
				791	* the osync code to catch these locked, dirty buffers without requeuing
				792	* any newly dirty buffers for write.
				793	*/
				794	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				795	{
				796	struct buffer_head *bh;
				797	struct list_head tmp;
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	798	struct address_space *mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	799	int err = 0, err2;
				800
				801	INIT_LIST_HEAD(&tmp);
				802
				803	spin_lock(lock);
				804	while (!list_empty(list)) {
				805	bh = BH_ENTRY(list->next);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	806	mapping = bh->b_assoc_map;
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	807	__remove_assoc_queue(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	808	/* Avoid race with mark_buffer_dirty_inode() which does
				809	* a lockless check and we rely on seeing the dirty bit */
				810	smp_mb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	811	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				812	list_add(&bh->b_assoc_buffers, &tmp);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	813	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	814	if (buffer_dirty(bh)) {
				815	get_bh(bh);
				816	spin_unlock(lock);
				817	/*
				818	* Ensure any pending I/O completes so that
				819	* ll_rw_block() actually writes the current
				820	* contents - it is a noop if I/O is still in
				821	* flight on potentially older contents.
				822	*/
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	823	ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	824	brelse(bh);
				825	spin_lock(lock);
				826	}
				827	}
				828	}
				829
				830	while (!list_empty(&tmp)) {
				831	bh = BH_ENTRY(tmp.prev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	832	get_bh(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	833	mapping = bh->b_assoc_map;
				834	__remove_assoc_queue(bh);
				835	/* Avoid race with mark_buffer_dirty_inode() which does
				836	* a lockless check and we rely on seeing the dirty bit */
				837	smp_mb();
				838	if (buffer_dirty(bh)) {
				839	list_add(&bh->b_assoc_buffers,
Jan Kara	e389229	2008-03-04 14:28:33 -0800	[diff] [blame]	840	&mapping->private_list);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	841	bh->b_assoc_map = mapping;
				842	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	843	spin_unlock(lock);
				844	wait_on_buffer(bh);
				845	if (!buffer_uptodate(bh))
				846	err = -EIO;
				847	brelse(bh);
				848	spin_lock(lock);
				849	}
				850
				851	spin_unlock(lock);
				852	err2 = osync_buffers_list(lock, list);
				853	if (err)
				854	return err;
				855	else
				856	return err2;
				857	}
				858
				859	/*
				860	* Invalidate any and all dirty buffers on a given inode. We are
				861	* probably unmounting the fs, but that doesn't mean we have already
				862	* done a sync(). Just drop the buffers from the inode list.
				863	*
				864	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				865	* assumes that all the buffers are against the blockdev. Not true
				866	* for reiserfs.
				867	*/
				868	void invalidate_inode_buffers(struct inode *inode)
				869	{
				870	if (inode_has_buffers(inode)) {
				871	struct address_space *mapping = &inode->i_data;
				872	struct list_head *list = &mapping->private_list;
				873	struct address_space *buffer_mapping = mapping->assoc_mapping;
				874
				875	spin_lock(&buffer_mapping->private_lock);
				876	while (!list_empty(list))
				877	__remove_assoc_queue(BH_ENTRY(list->next));
				878	spin_unlock(&buffer_mapping->private_lock);
				879	}
				880	}
				881
				882	/*
				883	* Remove any clean buffers from the inode's buffer list. This is called
				884	* when we're trying to free the inode itself. Those buffers can pin it.
				885	*
				886	* Returns true if all buffers were removed.
				887	*/
				888	int remove_inode_buffers(struct inode *inode)
				889	{
				890	int ret = 1;
				891
				892	if (inode_has_buffers(inode)) {
				893	struct address_space *mapping = &inode->i_data;
				894	struct list_head *list = &mapping->private_list;
				895	struct address_space *buffer_mapping = mapping->assoc_mapping;
				896
				897	spin_lock(&buffer_mapping->private_lock);
				898	while (!list_empty(list)) {
				899	struct buffer_head *bh = BH_ENTRY(list->next);
				900	if (buffer_dirty(bh)) {
				901	ret = 0;
				902	break;
				903	}
				904	__remove_assoc_queue(bh);
				905	}
				906	spin_unlock(&buffer_mapping->private_lock);
				907	}
				908	return ret;
				909	}
				910
				911	/*
				912	* Create the appropriate buffers when given a page for data area and
				913	* the size of each buffer.. Use the bh->b_this_page linked list to
				914	* follow the buffers created. Return NULL if unable to create more
				915	* buffers.
				916	*
				917	* The retry flag is used to differentiate async IO (paging, swapping)
				918	* which may not fail from ordinary buffer allocations.
				919	*/
				920	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				921	int retry)
				922	{
				923	struct buffer_head bh, head;
				924	long offset;
				925
				926	try_again:
				927	head = NULL;
				928	offset = PAGE_SIZE;
				929	while ((offset -= size) >= 0) {
				930	bh = alloc_buffer_head(GFP_NOFS);
				931	if (!bh)
				932	goto no_grow;
				933
				934	bh->b_bdev = NULL;
				935	bh->b_this_page = head;
				936	bh->b_blocknr = -1;
				937	head = bh;
				938
				939	bh->b_state = 0;
				940	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	941	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	942	bh->b_size = size;
				943
				944	/* Link the buffer to its page */
				945	set_bh_page(bh, page, offset);
				946
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	947	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	948	}
				949	return head;
				950	/*
				951	* In case anything failed, we just free everything we got.
				952	*/
				953	no_grow:
				954	if (head) {
				955	do {
				956	bh = head;
				957	head = head->b_this_page;
				958	free_buffer_head(bh);
				959	} while (head);
				960	}
				961
				962	/*
				963	* Return failure for non-async IO requests. Async IO requests
				964	* are not allowed to fail, so we have to wait until buffer heads
				965	* become available. But we don't want tasks sleeping with
				966	* partially complete buffers, so all were released above.
				967	*/
				968	if (!retry)
				969	return NULL;
				970
				971	/* We're _really_ low on memory. Now we just
				972	* wait for old buffer heads to become free due to
				973	* finishing IO. Since this is an async request and
				974	* the reserve list is empty, we're sure there are
				975	* async buffer heads in use.
				976	*/
				977	free_more_memory();
				978	goto try_again;
				979	}
				980	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				981
				982	static inline void
				983	link_dev_buffers(struct page page, struct buffer_head head)
				984	{
				985	struct buffer_head bh, tail;
				986
				987	bh = head;
				988	do {
				989	tail = bh;
				990	bh = bh->b_this_page;
				991	} while (bh);
				992	tail->b_this_page = head;
				993	attach_page_buffers(page, head);
				994	}
				995
				996	/*
				997	* Initialise the state of a blockdev page's buffers.
				998	*/
				999	static void
				1000	init_page_buffers(struct page page, struct block_device bdev,
				1001	sector_t block, int size)
				1002	{
				1003	struct buffer_head *head = page_buffers(page);
				1004	struct buffer_head *bh = head;
				1005	int uptodate = PageUptodate(page);
				1006
				1007	do {
				1008	if (!buffer_mapped(bh)) {
				1009	init_buffer(bh, NULL, NULL);
				1010	bh->b_bdev = bdev;
				1011	bh->b_blocknr = block;
				1012	if (uptodate)
				1013	set_buffer_uptodate(bh);
				1014	set_buffer_mapped(bh);
				1015	}
				1016	block++;
				1017	bh = bh->b_this_page;
				1018	} while (bh != head);
				1019	}
				1020
				1021	/*
				1022	* Create the page-cache page that contains the requested block.
				1023	*
				1024	* This is user purely for blockdev mappings.
				1025	*/
				1026	static struct page *
				1027	grow_dev_page(struct block_device *bdev, sector_t block,
				1028	pgoff_t index, int size)
				1029	{
				1030	struct inode *inode = bdev->bd_inode;
				1031	struct page *page;
				1032	struct buffer_head *bh;
				1033
Christoph Lameter	ea12589	2007-05-16 22:11:21 -0700	[diff] [blame]	1034	page = find_or_create_page(inode->i_mapping, index,
Mel Gorman	769848c	2007-07-17 04:03:05 -0700	[diff] [blame]	1035	(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)\|__GFP_MOVABLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1036	if (!page)
				1037	return NULL;
				1038
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1039	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1040
				1041	if (page_has_buffers(page)) {
				1042	bh = page_buffers(page);
				1043	if (bh->b_size == size) {
				1044	init_page_buffers(page, bdev, block, size);
				1045	return page;
				1046	}
				1047	if (!try_to_free_buffers(page))
				1048	goto failed;
				1049	}
				1050
				1051	/*
				1052	* Allocate some buffers for this page
				1053	*/
				1054	bh = alloc_page_buffers(page, size, 0);
				1055	if (!bh)
				1056	goto failed;
				1057
				1058	/*
				1059	* Link the page to the buffers and initialise them. Take the
				1060	* lock to be atomic wrt __find_get_block(), which does not
				1061	* run under the page lock.
				1062	*/
				1063	spin_lock(&inode->i_mapping->private_lock);
				1064	link_dev_buffers(page, bh);
				1065	init_page_buffers(page, bdev, block, size);
				1066	spin_unlock(&inode->i_mapping->private_lock);
				1067	return page;
				1068
				1069	failed:
				1070	BUG();
				1071	unlock_page(page);
				1072	page_cache_release(page);
				1073	return NULL;
				1074	}
				1075
				1076	/*
				1077	* Create buffers for the specified block device block's page. If
				1078	* that page was dirty, the buffers are set dirty also.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1079	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1080	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1081	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1082	{
				1083	struct page *page;
				1084	pgoff_t index;
				1085	int sizebits;
				1086
				1087	sizebits = -1;
				1088	do {
				1089	sizebits++;
				1090	} while ((size << sizebits) < PAGE_SIZE);
				1091
				1092	index = block >> sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1093
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1094	/*
				1095	* Check for a block which wants to lie outside our maximum possible
				1096	* pagecache index. (this comparison is done using sector_t types).
				1097	*/
				1098	if (unlikely(index != block >> sizebits)) {
				1099	char b[BDEVNAME_SIZE];
				1100
				1101	printk(KERN_ERR "%s: requested out-of-range block %llu for "
				1102	"device %s\n",
				1103	__FUNCTION__, (unsigned long long)block,
				1104	bdevname(bdev, b));
				1105	return -EIO;
				1106	}
				1107	block = index << sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1108	/* Create a page with the proper size buffers.. */
				1109	page = grow_dev_page(bdev, block, index, size);
				1110	if (!page)
				1111	return 0;
				1112	unlock_page(page);
				1113	page_cache_release(page);
				1114	return 1;
				1115	}
				1116
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1117	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1118	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1119	{
				1120	/* Size must be multiple of hard sectorsize */
				1121	if (unlikely(size & (bdev_hardsect_size(bdev)-1) \|\|
				1122	(size < 512 \|\| size > PAGE_SIZE))) {
				1123	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1124	size);
				1125	printk(KERN_ERR "hardsect size: %d\n",
				1126	bdev_hardsect_size(bdev));
				1127
				1128	dump_stack();
				1129	return NULL;
				1130	}
				1131
				1132	for (;;) {
				1133	struct buffer_head * bh;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1134	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1135
				1136	bh = __find_get_block(bdev, block, size);
				1137	if (bh)
				1138	return bh;
				1139
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1140	ret = grow_buffers(bdev, block, size);
				1141	if (ret < 0)
				1142	return NULL;
				1143	if (ret == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1144	free_more_memory();
				1145	}
				1146	}
				1147
				1148	/*
				1149	* The relationship between dirty buffers and dirty pages:
				1150	*
				1151	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1152	* the page is tagged dirty in its radix tree.
				1153	*
				1154	* At all times, the dirtiness of the buffers represents the dirtiness of
				1155	* subsections of the page. If the page has buffers, the page dirty bit is
				1156	* merely a hint about the true dirty state.
				1157	*
				1158	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1159	* (if the page has buffers).
				1160	*
				1161	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1162	* buffers are not.
				1163	*
				1164	* Also. When blockdev buffers are explicitly read with bread(), they
				1165	* individually become uptodate. But their backing page remains not
				1166	* uptodate - even if all of its buffers are uptodate. A subsequent
				1167	* block_read_full_page() against that page will discover all the uptodate
				1168	* buffers, will set the page uptodate and will perform no I/O.
				1169	*/
				1170
				1171	/**
				1172	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1173	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1174	*
				1175	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1176	* backing page dirty, then tag the page as dirty in its address_space's radix
				1177	* tree and then attach the address_space's inode to its superblock's dirty
				1178	* inode list.
				1179	*
				1180	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1181	* mapping->tree_lock and the global inode_lock.
				1182	*/
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	1183	void mark_buffer_dirty(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1184	{
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	1185	WARN_ON_ONCE(!buffer_uptodate(bh));
Linus Torvalds	1be62dc	2008-04-04 14:38:17 -0700	[diff] [blame]	1186
				1187	/*
				1188	* Very carefully optimize the it-is-already-dirty case.
				1189	*
				1190	* Don't let the final "is it dirty" escape to before we
				1191	* perhaps modified the buffer.
				1192	*/
				1193	if (buffer_dirty(bh)) {
				1194	smp_mb();
				1195	if (buffer_dirty(bh))
				1196	return;
				1197	}
				1198
				1199	if (!test_set_buffer_dirty(bh))
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	1200	__set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1201	}
				1202
				1203	/*
				1204	* Decrement a buffer_head's reference count. If all buffers against a page
				1205	* have zero reference count, are clean and unlocked, and if the page is clean
				1206	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1207	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1208	* a page but it ends up not being freed, and buffers may later be reattached).
				1209	*/
				1210	void __brelse(struct buffer_head * buf)
				1211	{
				1212	if (atomic_read(&buf->b_count)) {
				1213	put_bh(buf);
				1214	return;
				1215	}
				1216	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
				1217	WARN_ON(1);
				1218	}
				1219
				1220	/*
				1221	* bforget() is like brelse(), except it discards any
				1222	* potentially dirty data.
				1223	*/
				1224	void __bforget(struct buffer_head *bh)
				1225	{
				1226	clear_buffer_dirty(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	1227	if (bh->b_assoc_map) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1228	struct address_space *buffer_mapping = bh->b_page->mapping;
				1229
				1230	spin_lock(&buffer_mapping->private_lock);
				1231	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	1232	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1233	spin_unlock(&buffer_mapping->private_lock);
				1234	}
				1235	__brelse(bh);
				1236	}
				1237
				1238	static struct buffer_head __bread_slow(struct buffer_head bh)
				1239	{
				1240	lock_buffer(bh);
				1241	if (buffer_uptodate(bh)) {
				1242	unlock_buffer(bh);
				1243	return bh;
				1244	} else {
				1245	get_bh(bh);
				1246	bh->b_end_io = end_buffer_read_sync;
				1247	submit_bh(READ, bh);
				1248	wait_on_buffer(bh);
				1249	if (buffer_uptodate(bh))
				1250	return bh;
				1251	}
				1252	brelse(bh);
				1253	return NULL;
				1254	}
				1255
				1256	/*
				1257	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1258	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1259	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1260	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1261	* CPU's LRUs at the same time.
				1262	*
				1263	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1264	* sb_find_get_block().
				1265	*
				1266	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1267	* a local interrupt disable for that.
				1268	*/
				1269
				1270	#define BH_LRU_SIZE 8
				1271
				1272	struct bh_lru {
				1273	struct buffer_head *bhs[BH_LRU_SIZE];
				1274	};
				1275
				1276	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1277
				1278	#ifdef CONFIG_SMP
				1279	#define bh_lru_lock() local_irq_disable()
				1280	#define bh_lru_unlock() local_irq_enable()
				1281	#else
				1282	#define bh_lru_lock() preempt_disable()
				1283	#define bh_lru_unlock() preempt_enable()
				1284	#endif
				1285
				1286	static inline void check_irqs_on(void)
				1287	{
				1288	#ifdef irqs_disabled
				1289	BUG_ON(irqs_disabled());
				1290	#endif
				1291	}
				1292
				1293	/*
				1294	* The LRU management algorithm is dopey-but-simple. Sorry.
				1295	*/
				1296	static void bh_lru_install(struct buffer_head *bh)
				1297	{
				1298	struct buffer_head *evictee = NULL;
				1299	struct bh_lru *lru;
				1300
				1301	check_irqs_on();
				1302	bh_lru_lock();
				1303	lru = &__get_cpu_var(bh_lrus);
				1304	if (lru->bhs[0] != bh) {
				1305	struct buffer_head *bhs[BH_LRU_SIZE];
				1306	int in;
				1307	int out = 0;
				1308
				1309	get_bh(bh);
				1310	bhs[out++] = bh;
				1311	for (in = 0; in < BH_LRU_SIZE; in++) {
				1312	struct buffer_head *bh2 = lru->bhs[in];
				1313
				1314	if (bh2 == bh) {
				1315	__brelse(bh2);
				1316	} else {
				1317	if (out >= BH_LRU_SIZE) {
				1318	BUG_ON(evictee != NULL);
				1319	evictee = bh2;
				1320	} else {
				1321	bhs[out++] = bh2;
				1322	}
				1323	}
				1324	}
				1325	while (out < BH_LRU_SIZE)
				1326	bhs[out++] = NULL;
				1327	memcpy(lru->bhs, bhs, sizeof(bhs));
				1328	}
				1329	bh_lru_unlock();
				1330
				1331	if (evictee)
				1332	__brelse(evictee);
				1333	}
				1334
				1335	/*
				1336	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1337	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1338	static struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1339	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1340	{
				1341	struct buffer_head *ret = NULL;
				1342	struct bh_lru *lru;
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1343	unsigned int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1344
				1345	check_irqs_on();
				1346	bh_lru_lock();
				1347	lru = &__get_cpu_var(bh_lrus);
				1348	for (i = 0; i < BH_LRU_SIZE; i++) {
				1349	struct buffer_head *bh = lru->bhs[i];
				1350
				1351	if (bh && bh->b_bdev == bdev &&
				1352	bh->b_blocknr == block && bh->b_size == size) {
				1353	if (i) {
				1354	while (i) {
				1355	lru->bhs[i] = lru->bhs[i - 1];
				1356	i--;
				1357	}
				1358	lru->bhs[0] = bh;
				1359	}
				1360	get_bh(bh);
				1361	ret = bh;
				1362	break;
				1363	}
				1364	}
				1365	bh_lru_unlock();
				1366	return ret;
				1367	}
				1368
				1369	/*
				1370	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1371	* it in the LRU and mark it as accessed. If it is not present then return
				1372	* NULL
				1373	*/
				1374	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1375	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1376	{
				1377	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1378
				1379	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1380	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1381	if (bh)
				1382	bh_lru_install(bh);
				1383	}
				1384	if (bh)
				1385	touch_buffer(bh);
				1386	return bh;
				1387	}
				1388	EXPORT_SYMBOL(__find_get_block);
				1389
				1390	/*
				1391	* __getblk will locate (and, if necessary, create) the buffer_head
				1392	* which corresponds to the passed block_device, block and size. The
				1393	* returned buffer has its reference count incremented.
				1394	*
				1395	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1396	* illegal block number, __getblk() will happily return a buffer_head
				1397	* which represents the non-existent block. Very weird.
				1398	*
				1399	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1400	* attempt is failing. FIXME, perhaps?
				1401	*/
				1402	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1403	__getblk(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1404	{
				1405	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1406
				1407	might_sleep();
				1408	if (bh == NULL)
				1409	bh = __getblk_slow(bdev, block, size);
				1410	return bh;
				1411	}
				1412	EXPORT_SYMBOL(__getblk);
				1413
				1414	/*
				1415	* Do async read-ahead on a buffer..
				1416	*/
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1417	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1418	{
				1419	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1420	if (likely(bh)) {
				1421	ll_rw_block(READA, 1, &bh);
				1422	brelse(bh);
				1423	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1424	}
				1425	EXPORT_SYMBOL(__breadahead);
				1426
				1427	/**
				1428	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1429	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1430	* @block: number of block
				1431	* @size: size (in bytes) to read
				1432	*
				1433	* Reads a specified block, and returns buffer head that contains it.
				1434	* It returns NULL if the block was unreadable.
				1435	*/
				1436	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1437	__bread(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1438	{
				1439	struct buffer_head *bh = __getblk(bdev, block, size);
				1440
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1441	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1442	bh = __bread_slow(bh);
				1443	return bh;
				1444	}
				1445	EXPORT_SYMBOL(__bread);
				1446
				1447	/*
				1448	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1449	* This doesn't race because it runs in each cpu either in irq
				1450	* or with preempt disabled.
				1451	*/
				1452	static void invalidate_bh_lru(void *arg)
				1453	{
				1454	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1455	int i;
				1456
				1457	for (i = 0; i < BH_LRU_SIZE; i++) {
				1458	brelse(b->bhs[i]);
				1459	b->bhs[i] = NULL;
				1460	}
				1461	put_cpu_var(bh_lrus);
				1462	}
				1463
Peter Zijlstra	f9a1439	2007-05-06 14:49:55 -0700	[diff] [blame]	1464	void invalidate_bh_lrus(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1465	{
				1466	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
				1467	}
Nick Piggin	9db5579	2008-02-08 04:19:49 -0800	[diff] [blame]	1468	EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1469
				1470	void set_bh_page(struct buffer_head *bh,
				1471	struct page *page, unsigned long offset)
				1472	{
				1473	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1474	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1475	if (PageHighMem(page))
				1476	/*
				1477	* This catches illegal uses and preserves the offset:
				1478	*/
				1479	bh->b_data = (char *)(0 + offset);
				1480	else
				1481	bh->b_data = page_address(page) + offset;
				1482	}
				1483	EXPORT_SYMBOL(set_bh_page);
				1484
				1485	/*
				1486	* Called when truncating a buffer on a page completely.
				1487	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1488	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1489	{
				1490	lock_buffer(bh);
				1491	clear_buffer_dirty(bh);
				1492	bh->b_bdev = NULL;
				1493	clear_buffer_mapped(bh);
				1494	clear_buffer_req(bh);
				1495	clear_buffer_new(bh);
				1496	clear_buffer_delay(bh);
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1497	clear_buffer_unwritten(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1498	unlock_buffer(bh);
				1499	}
				1500
				1501	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1502	* block_invalidatepage - invalidate part of all of a buffer-backed page
				1503	*
				1504	* @page: the page which is affected
				1505	* @offset: the index of the truncation point
				1506	*
				1507	* block_invalidatepage() is called when all or part of the page has become
				1508	* invalidatedby a truncate operation.
				1509	*
				1510	* block_invalidatepage() does not have to release all buffers, but it must
				1511	* ensure that no dirty buffer is left outside @offset and that no I/O
				1512	* is underway against any of the blocks which are outside the truncation
				1513	* point. Because the caller is about to free (and possibly reuse) those
				1514	* blocks on-disk.
				1515	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1516	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1517	{
				1518	struct buffer_head head, bh, *next;
				1519	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1520
				1521	BUG_ON(!PageLocked(page));
				1522	if (!page_has_buffers(page))
				1523	goto out;
				1524
				1525	head = page_buffers(page);
				1526	bh = head;
				1527	do {
				1528	unsigned int next_off = curr_off + bh->b_size;
				1529	next = bh->b_this_page;
				1530
				1531	/*
				1532	* is this block fully invalidated?
				1533	*/
				1534	if (offset <= curr_off)
				1535	discard_buffer(bh);
				1536	curr_off = next_off;
				1537	bh = next;
				1538	} while (bh != head);
				1539
				1540	/*
				1541	* We release buffers only if the entire page is being invalidated.
				1542	* The get_block cached value has been unconditionally invalidated,
				1543	* so real IO is not possible anymore.
				1544	*/
				1545	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1546	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1547	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1548	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1549	}
				1550	EXPORT_SYMBOL(block_invalidatepage);
				1551
				1552	/*
				1553	* We attach and possibly dirty the buffers atomically wrt
				1554	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1555	* is already excluded via the page lock.
				1556	*/
				1557	void create_empty_buffers(struct page *page,
				1558	unsigned long blocksize, unsigned long b_state)
				1559	{
				1560	struct buffer_head bh, head, *tail;
				1561
				1562	head = alloc_page_buffers(page, blocksize, 1);
				1563	bh = head;
				1564	do {
				1565	bh->b_state \|= b_state;
				1566	tail = bh;
				1567	bh = bh->b_this_page;
				1568	} while (bh);
				1569	tail->b_this_page = head;
				1570
				1571	spin_lock(&page->mapping->private_lock);
				1572	if (PageUptodate(page) \|\| PageDirty(page)) {
				1573	bh = head;
				1574	do {
				1575	if (PageDirty(page))
				1576	set_buffer_dirty(bh);
				1577	if (PageUptodate(page))
				1578	set_buffer_uptodate(bh);
				1579	bh = bh->b_this_page;
				1580	} while (bh != head);
				1581	}
				1582	attach_page_buffers(page, head);
				1583	spin_unlock(&page->mapping->private_lock);
				1584	}
				1585	EXPORT_SYMBOL(create_empty_buffers);
				1586
				1587	/*
				1588	* We are taking a block for data and we don't want any output from any
				1589	* buffer-cache aliases starting from return from that function and
				1590	* until the moment when something will explicitly mark the buffer
				1591	* dirty (hopefully that will not happen until we will free that block ;-)
				1592	* We don't even need to mark it not-uptodate - nobody can expect
				1593	* anything from a newly allocated buffer anyway. We used to used
				1594	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1595	* don't want to mark the alias unmapped, for example - it would confuse
				1596	* anyone who might pick it with bread() afterwards...
				1597	*
				1598	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1599	* be writeout I/O going on against recently-freed buffers. We don't
				1600	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1601	* only if we really need to. That happens here.
				1602	*/
				1603	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1604	{
				1605	struct buffer_head *old_bh;
				1606
				1607	might_sleep();
				1608
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1609	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1610	if (old_bh) {
				1611	clear_buffer_dirty(old_bh);
				1612	wait_on_buffer(old_bh);
				1613	clear_buffer_req(old_bh);
				1614	__brelse(old_bh);
				1615	}
				1616	}
				1617	EXPORT_SYMBOL(unmap_underlying_metadata);
				1618
				1619	/*
				1620	* NOTE! All mapped/uptodate combinations are valid:
				1621	*
				1622	* Mapped Uptodate Meaning
				1623	*
				1624	* No No "unknown" - must do get_block()
				1625	* No Yes "hole" - zero-filled
				1626	* Yes No "allocated" - allocated on disk, not read in
				1627	* Yes Yes "valid" - allocated and up-to-date in memory.
				1628	*
				1629	* "Dirty" is valid only with the last case (mapped+uptodate).
				1630	*/
				1631
				1632	/*
				1633	* While block_write_full_page is writing back the dirty buffers under
				1634	* the page lock, whoever dirtied the buffers may decide to clean them
				1635	* again at any time. We handle that by only looking at the buffer
				1636	* state inside lock_buffer().
				1637	*
				1638	* If block_write_full_page() is called for regular writeback
				1639	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1640	* locked buffer. This only can happen if someone has written the buffer
				1641	* directly, with submit_bh(). At the address_space level PageWriteback
				1642	* prevents this contention from occurring.
				1643	*/
				1644	static int __block_write_full_page(struct inode inode, struct page page,
				1645	get_block_t get_block, struct writeback_control wbc)
				1646	{
				1647	int err;
				1648	sector_t block;
				1649	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1650	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1651	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1652	int nr_underway = 0;
				1653
				1654	BUG_ON(!PageLocked(page));
				1655
				1656	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1657
				1658	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1659	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1660	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1661	}
				1662
				1663	/*
				1664	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1665	* here, and the (potentially unmapped) buffers may become dirty at
				1666	* any time. If a buffer becomes dirty here after we've inspected it
				1667	* then we just miss that fact, and the page stays dirty.
				1668	*
				1669	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1670	* handle that here by just cleaning them.
				1671	*/
				1672
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1673	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1674	head = page_buffers(page);
				1675	bh = head;
				1676
				1677	/*
				1678	* Get all the dirty buffers mapped to disk addresses and
				1679	* handle any aliases from the underlying blockdev's mapping.
				1680	*/
				1681	do {
				1682	if (block > last_block) {
				1683	/*
				1684	* mapped buffers outside i_size will occur, because
				1685	* this page can be outside i_size when there is a
				1686	* truncate in progress.
				1687	*/
				1688	/*
				1689	* The buffer was zeroed by block_write_full_page()
				1690	*/
				1691	clear_buffer_dirty(bh);
				1692	set_buffer_uptodate(bh);
				1693	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1694	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1695	err = get_block(inode, block, bh, 1);
				1696	if (err)
				1697	goto recover;
				1698	if (buffer_new(bh)) {
				1699	/* blockdev mappings never come here */
				1700	clear_buffer_new(bh);
				1701	unmap_underlying_metadata(bh->b_bdev,
				1702	bh->b_blocknr);
				1703	}
				1704	}
				1705	bh = bh->b_this_page;
				1706	block++;
				1707	} while (bh != head);
				1708
				1709	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1710	if (!buffer_mapped(bh))
				1711	continue;
				1712	/*
				1713	* If it's a fully non-blocking write attempt and we cannot
				1714	* lock the buffer then redirty the page. Note that this can
				1715	* potentially cause a busy-wait loop from pdflush and kswapd
				1716	* activity, but those code paths have their own higher-level
				1717	* throttling.
				1718	*/
				1719	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1720	lock_buffer(bh);
				1721	} else if (test_set_buffer_locked(bh)) {
				1722	redirty_page_for_writepage(wbc, page);
				1723	continue;
				1724	}
				1725	if (test_clear_buffer_dirty(bh)) {
				1726	mark_buffer_async_write(bh);
				1727	} else {
				1728	unlock_buffer(bh);
				1729	}
				1730	} while ((bh = bh->b_this_page) != head);
				1731
				1732	/*
				1733	* The page and its buffers are protected by PageWriteback(), so we can
				1734	* drop the bh refcounts early.
				1735	*/
				1736	BUG_ON(PageWriteback(page));
				1737	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1738
				1739	do {
				1740	struct buffer_head *next = bh->b_this_page;
				1741	if (buffer_async_write(bh)) {
				1742	submit_bh(WRITE, bh);
				1743	nr_underway++;
				1744	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1745	bh = next;
				1746	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1747	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1748
				1749	err = 0;
				1750	done:
				1751	if (nr_underway == 0) {
				1752	/*
				1753	* The page was marked dirty, but the buffers were
				1754	* clean. Someone wrote them back by hand with
				1755	* ll_rw_block/submit_bh. A rare case.
				1756	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1757	end_page_writeback(page);
Nick Piggin	3d67f2d	2007-05-06 14:49:05 -0700	[diff] [blame]	1758
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1759	/*
				1760	* The page and buffer_heads can be released at any time from
				1761	* here on.
				1762	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1763	}
				1764	return err;
				1765
				1766	recover:
				1767	/*
				1768	* ENOSPC, or some other error. We may already have added some
				1769	* blocks to the file, so we need to write these out to avoid
				1770	* exposing stale data.
				1771	* The page is currently locked and not marked for writeback
				1772	*/
				1773	bh = head;
				1774	/* Recovery: lock and submit the mapped buffers */
				1775	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1776	if (buffer_mapped(bh) && buffer_dirty(bh)) {
				1777	lock_buffer(bh);
				1778	mark_buffer_async_write(bh);
				1779	} else {
				1780	/*
				1781	* The buffer may have been set dirty during
				1782	* attachment to a dirty page.
				1783	*/
				1784	clear_buffer_dirty(bh);
				1785	}
				1786	} while ((bh = bh->b_this_page) != head);
				1787	SetPageError(page);
				1788	BUG_ON(PageWriteback(page));
Andrew Morton	7e4c369	2007-05-08 00:23:27 -0700	[diff] [blame]	1789	mapping_set_error(page->mapping, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1790	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1791	do {
				1792	struct buffer_head *next = bh->b_this_page;
				1793	if (buffer_async_write(bh)) {
				1794	clear_buffer_dirty(bh);
				1795	submit_bh(WRITE, bh);
				1796	nr_underway++;
				1797	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1798	bh = next;
				1799	} while (bh != head);
Nick Piggin	ffda9d3	2007-02-20 13:57:54 -0800	[diff] [blame]	1800	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1801	goto done;
				1802	}
				1803
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1804	/*
				1805	* If a page has any new buffers, zero them out here, and mark them uptodate
				1806	* and dirty so they'll be written out (in order to prevent uninitialised
				1807	* block data from leaking). And clear the new bit.
				1808	*/
				1809	void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
				1810	{
				1811	unsigned int block_start, block_end;
				1812	struct buffer_head head, bh;
				1813
				1814	BUG_ON(!PageLocked(page));
				1815	if (!page_has_buffers(page))
				1816	return;
				1817
				1818	bh = head = page_buffers(page);
				1819	block_start = 0;
				1820	do {
				1821	block_end = block_start + bh->b_size;
				1822
				1823	if (buffer_new(bh)) {
				1824	if (block_end > from && block_start < to) {
				1825	if (!PageUptodate(page)) {
				1826	unsigned start, size;
				1827
				1828	start = max(from, block_start);
				1829	size = min(to, block_end) - start;
				1830
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1831	zero_user(page, start, size);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1832	set_buffer_uptodate(bh);
				1833	}
				1834
				1835	clear_buffer_new(bh);
				1836	mark_buffer_dirty(bh);
				1837	}
				1838	}
				1839
				1840	block_start = block_end;
				1841	bh = bh->b_this_page;
				1842	} while (bh != head);
				1843	}
				1844	EXPORT_SYMBOL(page_zero_new_buffers);
				1845
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1846	static int __block_prepare_write(struct inode inode, struct page page,
				1847	unsigned from, unsigned to, get_block_t *get_block)
				1848	{
				1849	unsigned block_start, block_end;
				1850	sector_t block;
				1851	int err = 0;
				1852	unsigned blocksize, bbits;
				1853	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1854
				1855	BUG_ON(!PageLocked(page));
				1856	BUG_ON(from > PAGE_CACHE_SIZE);
				1857	BUG_ON(to > PAGE_CACHE_SIZE);
				1858	BUG_ON(from > to);
				1859
				1860	blocksize = 1 << inode->i_blkbits;
				1861	if (!page_has_buffers(page))
				1862	create_empty_buffers(page, blocksize, 0);
				1863	head = page_buffers(page);
				1864
				1865	bbits = inode->i_blkbits;
				1866	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1867
				1868	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1869	block++, block_start=block_end, bh = bh->b_this_page) {
				1870	block_end = block_start + blocksize;
				1871	if (block_end <= from \|\| block_start >= to) {
				1872	if (PageUptodate(page)) {
				1873	if (!buffer_uptodate(bh))
				1874	set_buffer_uptodate(bh);
				1875	}
				1876	continue;
				1877	}
				1878	if (buffer_new(bh))
				1879	clear_buffer_new(bh);
				1880	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1881	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1882	err = get_block(inode, block, bh, 1);
				1883	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1884	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1885	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1886	unmap_underlying_metadata(bh->b_bdev,
				1887	bh->b_blocknr);
				1888	if (PageUptodate(page)) {
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1889	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1890	set_buffer_uptodate(bh);
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1891	mark_buffer_dirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1892	continue;
				1893	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1894	if (block_end > to \|\| block_start < from)
				1895	zero_user_segments(page,
				1896	to, block_end,
				1897	block_start, from);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1898	continue;
				1899	}
				1900	}
				1901	if (PageUptodate(page)) {
				1902	if (!buffer_uptodate(bh))
				1903	set_buffer_uptodate(bh);
				1904	continue;
				1905	}
				1906	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1907	!buffer_unwritten(bh) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1908	(block_start < from \|\| block_end > to)) {
				1909	ll_rw_block(READ, 1, &bh);
				1910	*wait_bh++=bh;
				1911	}
				1912	}
				1913	/*
				1914	* If we issued read requests - let them complete.
				1915	*/
				1916	while(wait_bh > wait) {
				1917	wait_on_buffer(*--wait_bh);
				1918	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1919	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1920	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1921	if (unlikely(err))
				1922	page_zero_new_buffers(page, from, to);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1923	return err;
				1924	}
				1925
				1926	static int __block_commit_write(struct inode inode, struct page page,
				1927	unsigned from, unsigned to)
				1928	{
				1929	unsigned block_start, block_end;
				1930	int partial = 0;
				1931	unsigned blocksize;
				1932	struct buffer_head bh, head;
				1933
				1934	blocksize = 1 << inode->i_blkbits;
				1935
				1936	for(bh = head = page_buffers(page), block_start = 0;
				1937	bh != head \|\| !block_start;
				1938	block_start=block_end, bh = bh->b_this_page) {
				1939	block_end = block_start + blocksize;
				1940	if (block_end <= from \|\| block_start >= to) {
				1941	if (!buffer_uptodate(bh))
				1942	partial = 1;
				1943	} else {
				1944	set_buffer_uptodate(bh);
				1945	mark_buffer_dirty(bh);
				1946	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1947	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1948	}
				1949
				1950	/*
				1951	* If this is a partial write which happened to make all buffers
				1952	* uptodate then we can optimize away a bogus readpage() for
				1953	* the next read(). Here we 'discover' whether the page went
				1954	* uptodate as a result of this (potentially partial) write.
				1955	*/
				1956	if (!partial)
				1957	SetPageUptodate(page);
				1958	return 0;
				1959	}
				1960
				1961	/*
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1962	* block_write_begin takes care of the basic task of block allocation and
				1963	* bringing partial write blocks uptodate first.
				1964	*
				1965	* If *pagep is not NULL, then block_write_begin uses the locked page
				1966	* at *pagep rather than allocating its own. In this case, the page will
				1967	* not be unlocked or deallocated on failure.
				1968	*/
				1969	int block_write_begin(struct file file, struct address_space mapping,
				1970	loff_t pos, unsigned len, unsigned flags,
				1971	struct page pagep, void fsdata,
				1972	get_block_t *get_block)
				1973	{
				1974	struct inode *inode = mapping->host;
				1975	int status = 0;
				1976	struct page *page;
				1977	pgoff_t index;
				1978	unsigned start, end;
				1979	int ownpage = 0;
				1980
				1981	index = pos >> PAGE_CACHE_SHIFT;
				1982	start = pos & (PAGE_CACHE_SIZE - 1);
				1983	end = start + len;
				1984
				1985	page = *pagep;
				1986	if (page == NULL) {
				1987	ownpage = 1;
				1988	page = __grab_cache_page(mapping, index);
				1989	if (!page) {
				1990	status = -ENOMEM;
				1991	goto out;
				1992	}
				1993	*pagep = page;
				1994	} else
				1995	BUG_ON(!PageLocked(page));
				1996
				1997	status = __block_prepare_write(inode, page, start, end, get_block);
				1998	if (unlikely(status)) {
				1999	ClearPageUptodate(page);
				2000
				2001	if (ownpage) {
				2002	unlock_page(page);
				2003	page_cache_release(page);
				2004	*pagep = NULL;
				2005
				2006	/*
				2007	* prepare_write() may have instantiated a few blocks
				2008	* outside i_size. Trim these off again. Don't need
				2009	* i_size_read because we hold i_mutex.
				2010	*/
				2011	if (pos + len > inode->i_size)
				2012	vmtruncate(inode, inode->i_size);
				2013	}
				2014	goto out;
				2015	}
				2016
				2017	out:
				2018	return status;
				2019	}
				2020	EXPORT_SYMBOL(block_write_begin);
				2021
				2022	int block_write_end(struct file file, struct address_space mapping,
				2023	loff_t pos, unsigned len, unsigned copied,
				2024	struct page page, void fsdata)
				2025	{
				2026	struct inode *inode = mapping->host;
				2027	unsigned start;
				2028
				2029	start = pos & (PAGE_CACHE_SIZE - 1);
				2030
				2031	if (unlikely(copied < len)) {
				2032	/*
				2033	* The buffers that were written will now be uptodate, so we
				2034	* don't have to worry about a readpage reading them and
				2035	* overwriting a partial write. However if we have encountered
				2036	* a short write and only partially written into a buffer, it
				2037	* will not be marked uptodate, so a readpage might come in and
				2038	* destroy our partial write.
				2039	*
				2040	* Do the simplest thing, and just treat any short write to a
				2041	* non uptodate page as a zero-length write, and force the
				2042	* caller to redo the whole thing.
				2043	*/
				2044	if (!PageUptodate(page))
				2045	copied = 0;
				2046
				2047	page_zero_new_buffers(page, start+copied, start+len);
				2048	}
				2049	flush_dcache_page(page);
				2050
				2051	/* This could be a short (even 0-length) commit */
				2052	__block_commit_write(inode, page, start, start+copied);
				2053
				2054	return copied;
				2055	}
				2056	EXPORT_SYMBOL(block_write_end);
				2057
				2058	int generic_write_end(struct file file, struct address_space mapping,
				2059	loff_t pos, unsigned len, unsigned copied,
				2060	struct page page, void fsdata)
				2061	{
				2062	struct inode *inode = mapping->host;
				2063
				2064	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
				2065
				2066	/*
				2067	* No need to use i_size_read() here, the i_size
				2068	* cannot change under us because we hold i_mutex.
				2069	*
				2070	* But it's important to update i_size while still holding page lock:
				2071	* page writeout could otherwise come in and zero beyond i_size.
				2072	*/
				2073	if (pos+copied > inode->i_size) {
				2074	i_size_write(inode, pos+copied);
				2075	mark_inode_dirty(inode);
				2076	}
				2077
				2078	unlock_page(page);
				2079	page_cache_release(page);
				2080
				2081	return copied;
				2082	}
				2083	EXPORT_SYMBOL(generic_write_end);
				2084
				2085	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2086	* Generic "read page" function for block devices that have the normal
				2087	* get_block functionality. This is most of the block device filesystems.
				2088	* Reads the page asynchronously --- the unlock_buffer() and
				2089	* set/clear_buffer_uptodate() functions propagate buffer state into the
				2090	* page struct once IO has completed.
				2091	*/
				2092	int block_read_full_page(struct page page, get_block_t get_block)
				2093	{
				2094	struct inode *inode = page->mapping->host;
				2095	sector_t iblock, lblock;
				2096	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				2097	unsigned int blocksize;
				2098	int nr, i;
				2099	int fully_mapped = 1;
				2100
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	2101	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2102	blocksize = 1 << inode->i_blkbits;
				2103	if (!page_has_buffers(page))
				2104	create_empty_buffers(page, blocksize, 0);
				2105	head = page_buffers(page);
				2106
				2107	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2108	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				2109	bh = head;
				2110	nr = 0;
				2111	i = 0;
				2112
				2113	do {
				2114	if (buffer_uptodate(bh))
				2115	continue;
				2116
				2117	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2118	int err = 0;
				2119
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2120	fully_mapped = 0;
				2121	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2122	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2123	err = get_block(inode, iblock, bh, 0);
				2124	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2125	SetPageError(page);
				2126	}
				2127	if (!buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2128	zero_user(page, i * blocksize, blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2129	if (!err)
				2130	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2131	continue;
				2132	}
				2133	/*
				2134	* get_block() might have updated the buffer
				2135	* synchronously
				2136	*/
				2137	if (buffer_uptodate(bh))
				2138	continue;
				2139	}
				2140	arr[nr++] = bh;
				2141	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2142
				2143	if (fully_mapped)
				2144	SetPageMappedToDisk(page);
				2145
				2146	if (!nr) {
				2147	/*
				2148	* All buffers are uptodate - we can set the page uptodate
				2149	* as well. But not if get_block() returned an error.
				2150	*/
				2151	if (!PageError(page))
				2152	SetPageUptodate(page);
				2153	unlock_page(page);
				2154	return 0;
				2155	}
				2156
				2157	/* Stage two: lock the buffers */
				2158	for (i = 0; i < nr; i++) {
				2159	bh = arr[i];
				2160	lock_buffer(bh);
				2161	mark_buffer_async_read(bh);
				2162	}
				2163
				2164	/*
				2165	* Stage 3: start the IO. Check for uptodateness
				2166	* inside the buffer lock in case another process reading
				2167	* the underlying blockdev brought it uptodate (the sct fix).
				2168	*/
				2169	for (i = 0; i < nr; i++) {
				2170	bh = arr[i];
				2171	if (buffer_uptodate(bh))
				2172	end_buffer_async_read(bh, 1);
				2173	else
				2174	submit_bh(READ, bh);
				2175	}
				2176	return 0;
				2177	}
				2178
				2179	/* utility function for filesystems that need to do work on expanding
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2180	* truncates. Uses filesystem pagecache writes to allow the filesystem to
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2181	* deal with the hole.
				2182	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2183	int generic_cont_expand_simple(struct inode *inode, loff_t size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2184	{
				2185	struct address_space *mapping = inode->i_mapping;
				2186	struct page *page;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2187	void *fsdata;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2188	unsigned long limit;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2189	int err;
				2190
				2191	err = -EFBIG;
				2192	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2193	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2194	send_sig(SIGXFSZ, current, 0);
				2195	goto out;
				2196	}
				2197	if (size > inode->i_sb->s_maxbytes)
				2198	goto out;
				2199
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2200	err = pagecache_write_begin(NULL, mapping, size, 0,
				2201	AOP_FLAG_UNINTERRUPTIBLE\|AOP_FLAG_CONT_EXPAND,
				2202	&page, &fsdata);
				2203	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2204	goto out;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2205
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2206	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
				2207	BUG_ON(err > 0);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2208
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2209	out:
				2210	return err;
				2211	}
				2212
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2213	int cont_expand_zero(struct file file, struct address_space mapping,
				2214	loff_t pos, loff_t *bytes)
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2215	{
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2216	struct inode *inode = mapping->host;
				2217	unsigned blocksize = 1 << inode->i_blkbits;
				2218	struct page *page;
				2219	void *fsdata;
				2220	pgoff_t index, curidx;
				2221	loff_t curpos;
				2222	unsigned zerofrom, offset, len;
				2223	int err = 0;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2224
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2225	index = pos >> PAGE_CACHE_SHIFT;
				2226	offset = pos & ~PAGE_CACHE_MASK;
				2227
				2228	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
				2229	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2230	if (zerofrom & (blocksize-1)) {
				2231	*bytes \|= (blocksize-1);
				2232	(*bytes)++;
				2233	}
				2234	len = PAGE_CACHE_SIZE - zerofrom;
				2235
				2236	err = pagecache_write_begin(file, mapping, curpos, len,
				2237	AOP_FLAG_UNINTERRUPTIBLE,
				2238	&page, &fsdata);
				2239	if (err)
				2240	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2241	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2242	err = pagecache_write_end(file, mapping, curpos, len, len,
				2243	page, fsdata);
				2244	if (err < 0)
				2245	goto out;
				2246	BUG_ON(err != len);
				2247	err = 0;
				2248	}
				2249
				2250	/* page covers the boundary, find the boundary offset */
				2251	if (index == curidx) {
				2252	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2253	/* if we will expand the thing last block will be filled */
				2254	if (offset <= zerofrom) {
				2255	goto out;
				2256	}
				2257	if (zerofrom & (blocksize-1)) {
				2258	*bytes \|= (blocksize-1);
				2259	(*bytes)++;
				2260	}
				2261	len = offset - zerofrom;
				2262
				2263	err = pagecache_write_begin(file, mapping, curpos, len,
				2264	AOP_FLAG_UNINTERRUPTIBLE,
				2265	&page, &fsdata);
				2266	if (err)
				2267	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2268	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2269	err = pagecache_write_end(file, mapping, curpos, len, len,
				2270	page, fsdata);
				2271	if (err < 0)
				2272	goto out;
				2273	BUG_ON(err != len);
				2274	err = 0;
				2275	}
				2276	out:
				2277	return err;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2278	}
				2279
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2280	/*
				2281	* For moronic filesystems that do not allow holes in file.
				2282	* We may have to extend the file.
				2283	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2284	int cont_write_begin(struct file file, struct address_space mapping,
				2285	loff_t pos, unsigned len, unsigned flags,
				2286	struct page pagep, void fsdata,
				2287	get_block_t get_block, loff_t bytes)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2288	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2289	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2290	unsigned blocksize = 1 << inode->i_blkbits;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2291	unsigned zerofrom;
				2292	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2293
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2294	err = cont_expand_zero(file, mapping, pos, bytes);
				2295	if (err)
				2296	goto out;
				2297
				2298	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2299	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
				2300	*bytes \|= (blocksize-1);
				2301	(*bytes)++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2302	}
				2303
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2304	*pagep = NULL;
				2305	err = block_write_begin(file, mapping, pos, len,
				2306	flags, pagep, fsdata, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2307	out:
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2308	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2309	}
				2310
				2311	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2312	get_block_t *get_block)
				2313	{
				2314	struct inode *inode = page->mapping->host;
				2315	int err = __block_prepare_write(inode, page, from, to, get_block);
				2316	if (err)
				2317	ClearPageUptodate(page);
				2318	return err;
				2319	}
				2320
				2321	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2322	{
				2323	struct inode *inode = page->mapping->host;
				2324	__block_commit_write(inode,page,from,to);
				2325	return 0;
				2326	}
				2327
				2328	int generic_commit_write(struct file file, struct page page,
				2329	unsigned from, unsigned to)
				2330	{
				2331	struct inode *inode = page->mapping->host;
				2332	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2333	__block_commit_write(inode,page,from,to);
				2334	/*
				2335	* No need to use i_size_read() here, the i_size
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	2336	* cannot change under us because we hold i_mutex.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2337	*/
				2338	if (pos > inode->i_size) {
				2339	i_size_write(inode, pos);
				2340	mark_inode_dirty(inode);
				2341	}
				2342	return 0;
				2343	}
				2344
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2345	/*
				2346	* block_page_mkwrite() is not allowed to change the file size as it gets
				2347	* called from a page fault handler when a page is first dirtied. Hence we must
				2348	* be careful to check for EOF conditions here. We set the page up correctly
				2349	* for a written page which means we get ENOSPC checking when writing into
				2350	* holes and correct delalloc and unwritten extent mapping on filesystems that
				2351	* support these features.
				2352	*
				2353	* We are not allowed to take the i_mutex here so we have to play games to
				2354	* protect against truncate races as the page could now be beyond EOF. Because
				2355	* vmtruncate() writes the inode size before removing pages, once we have the
				2356	* page lock we can determine safely if the page is beyond EOF. If it is not
				2357	* beyond EOF, then the page is guaranteed safe against truncation until we
				2358	* unlock the page.
				2359	*/
				2360	int
				2361	block_page_mkwrite(struct vm_area_struct vma, struct page page,
				2362	get_block_t get_block)
				2363	{
				2364	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
				2365	unsigned long end;
				2366	loff_t size;
				2367	int ret = -EINVAL;
				2368
				2369	lock_page(page);
				2370	size = i_size_read(inode);
				2371	if ((page->mapping != inode->i_mapping) \|\|
Nick Piggin	1833633	2007-07-20 00:31:45 -0700	[diff] [blame]	2372	(page_offset(page) > size)) {
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2373	/* page got truncated out from underneath us */
				2374	goto out_unlock;
				2375	}
				2376
				2377	/* page is wholly or partially inside EOF */
				2378	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
				2379	end = size & ~PAGE_CACHE_MASK;
				2380	else
				2381	end = PAGE_CACHE_SIZE;
				2382
				2383	ret = block_prepare_write(page, 0, end, get_block);
				2384	if (!ret)
				2385	ret = block_commit_write(page, 0, end);
				2386
				2387	out_unlock:
				2388	unlock_page(page);
				2389	return ret;
				2390	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2391
				2392	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2393	* nobh_write_begin()'s prereads are special: the buffer_heads are freed
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2394	* immediately, while under the page lock. So it needs a special end_io
				2395	* handler which does not touch the bh after unlocking it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2396	*/
				2397	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2398	{
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	2399	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2400	}
				2401
				2402	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2403	* Attach the singly-linked list of buffers created by nobh_write_begin, to
				2404	* the page (converting it to circular linked list and taking care of page
				2405	* dirty races).
				2406	*/
				2407	static void attach_nobh_buffers(struct page page, struct buffer_head head)
				2408	{
				2409	struct buffer_head *bh;
				2410
				2411	BUG_ON(!PageLocked(page));
				2412
				2413	spin_lock(&page->mapping->private_lock);
				2414	bh = head;
				2415	do {
				2416	if (PageDirty(page))
				2417	set_buffer_dirty(bh);
				2418	if (!bh->b_this_page)
				2419	bh->b_this_page = head;
				2420	bh = bh->b_this_page;
				2421	} while (bh != head);
				2422	attach_page_buffers(page, head);
				2423	spin_unlock(&page->mapping->private_lock);
				2424	}
				2425
				2426	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2427	* On entry, the page is fully not uptodate.
				2428	* On exit the page is fully uptodate in the areas outside (from,to)
				2429	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2430	int nobh_write_begin(struct file file, struct address_space mapping,
				2431	loff_t pos, unsigned len, unsigned flags,
				2432	struct page pagep, void fsdata,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2433	get_block_t *get_block)
				2434	{
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2435	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2436	const unsigned blkbits = inode->i_blkbits;
				2437	const unsigned blocksize = 1 << blkbits;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2438	struct buffer_head head, bh;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2439	struct page *page;
				2440	pgoff_t index;
				2441	unsigned from, to;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2442	unsigned block_in_page;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2443	unsigned block_start, block_end;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2444	sector_t block_in_file;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2445	int nr_reads = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2446	int ret = 0;
				2447	int is_mapped_to_disk = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2448
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2449	index = pos >> PAGE_CACHE_SHIFT;
				2450	from = pos & (PAGE_CACHE_SIZE - 1);
				2451	to = from + len;
				2452
				2453	page = __grab_cache_page(mapping, index);
				2454	if (!page)
				2455	return -ENOMEM;
				2456	*pagep = page;
				2457	*fsdata = NULL;
				2458
				2459	if (page_has_buffers(page)) {
				2460	unlock_page(page);
				2461	page_cache_release(page);
				2462	*pagep = NULL;
				2463	return block_write_begin(file, mapping, pos, len, flags, pagep,
				2464	fsdata, get_block);
				2465	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2466
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2467	if (PageMappedToDisk(page))
				2468	return 0;
				2469
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2470	/*
				2471	* Allocate buffers so that we can keep track of state, and potentially
				2472	* attach them to the page if an error occurs. In the common case of
				2473	* no error, they will just be freed again without ever being attached
				2474	* to the page (which is all OK, because we're under the page lock).
				2475	*
				2476	* Be careful: the buffer linked list is a NULL terminated one, rather
				2477	* than the circular one we're used to.
				2478	*/
				2479	head = alloc_page_buffers(page, blocksize, 0);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2480	if (!head) {
				2481	ret = -ENOMEM;
				2482	goto out_release;
				2483	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2484
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2485	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2486
				2487	/*
				2488	* We loop across all blocks in the page, whether or not they are
				2489	* part of the affected region. This is so we can discover if the
				2490	* page is fully mapped-to-disk.
				2491	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2492	for (block_start = 0, block_in_page = 0, bh = head;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2493	block_start < PAGE_CACHE_SIZE;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2494	block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2495	int create;
				2496
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2497	block_end = block_start + blocksize;
				2498	bh->b_state = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2499	create = 1;
				2500	if (block_start >= to)
				2501	create = 0;
				2502	ret = get_block(inode, block_in_file + block_in_page,
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2503	bh, create);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2504	if (ret)
				2505	goto failed;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2506	if (!buffer_mapped(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2507	is_mapped_to_disk = 0;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2508	if (buffer_new(bh))
				2509	unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
				2510	if (PageUptodate(page)) {
				2511	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2512	continue;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2513	}
				2514	if (buffer_new(bh) \|\| !buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2515	zero_user_segments(page, block_start, from,
				2516	to, block_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2517	continue;
				2518	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2519	if (buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2520	continue; /* reiserfs does this */
				2521	if (block_start < from \|\| block_end > to) {
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2522	lock_buffer(bh);
				2523	bh->b_end_io = end_buffer_read_nobh;
				2524	submit_bh(READ, bh);
				2525	nr_reads++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2526	}
				2527	}
				2528
				2529	if (nr_reads) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2530	/*
				2531	* The page is locked, so these buffers are protected from
				2532	* any VM or truncate activity. Hence we don't need to care
				2533	* for the buffer_head refcounts.
				2534	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2535	for (bh = head; bh; bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2536	wait_on_buffer(bh);
				2537	if (!buffer_uptodate(bh))
				2538	ret = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2539	}
				2540	if (ret)
				2541	goto failed;
				2542	}
				2543
				2544	if (is_mapped_to_disk)
				2545	SetPageMappedToDisk(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2546
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2547	fsdata = head; / to be released by nobh_write_end */
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2548
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2549	return 0;
				2550
				2551	failed:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2552	BUG_ON(!ret);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2553	/*
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2554	* Error recovery is a bit difficult. We need to zero out blocks that
				2555	* were newly allocated, and dirty them to ensure they get written out.
				2556	* Buffers need to be attached to the page at this point, otherwise
				2557	* the handling of potential IO errors during writeout would be hard
				2558	* (could try doing synchronous writeout, but what if that fails too?)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2559	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2560	attach_nobh_buffers(page, head);
				2561	page_zero_new_buffers(page, from, to);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2562
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2563	out_release:
				2564	unlock_page(page);
				2565	page_cache_release(page);
				2566	*pagep = NULL;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2567
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2568	if (pos + len > inode->i_size)
				2569	vmtruncate(inode, inode->i_size);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2570
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2571	return ret;
				2572	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2573	EXPORT_SYMBOL(nobh_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2574
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2575	int nobh_write_end(struct file file, struct address_space mapping,
				2576	loff_t pos, unsigned len, unsigned copied,
				2577	struct page page, void fsdata)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2578	{
				2579	struct inode *inode = page->mapping->host;
Nick Piggin	efdc313	2007-10-21 06:57:41 +0200	[diff] [blame]	2580	struct buffer_head *head = fsdata;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2581	struct buffer_head *bh;
Dmitri Monakhov	5b41e74	2008-03-28 14:15:52 -0700	[diff] [blame]	2582	BUG_ON(fsdata != NULL && page_has_buffers(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2583
Dmitri Monakhov	5b41e74	2008-03-28 14:15:52 -0700	[diff] [blame]	2584	if (unlikely(copied < len) && !page_has_buffers(page))
				2585	attach_nobh_buffers(page, head);
				2586	if (page_has_buffers(page))
				2587	return generic_write_end(file, mapping, pos, len,
				2588	copied, page, fsdata);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2589
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2590	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2591	set_page_dirty(page);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2592	if (pos+copied > inode->i_size) {
				2593	i_size_write(inode, pos+copied);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2594	mark_inode_dirty(inode);
				2595	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2596
				2597	unlock_page(page);
				2598	page_cache_release(page);
				2599
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2600	while (head) {
				2601	bh = head;
				2602	head = head->b_this_page;
				2603	free_buffer_head(bh);
				2604	}
				2605
				2606	return copied;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2607	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2608	EXPORT_SYMBOL(nobh_write_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2609
				2610	/*
				2611	* nobh_writepage() - based on block_full_write_page() except
				2612	* that it tries to operate without attaching bufferheads to
				2613	* the page.
				2614	*/
				2615	int nobh_writepage(struct page page, get_block_t get_block,
				2616	struct writeback_control *wbc)
				2617	{
				2618	struct inode * const inode = page->mapping->host;
				2619	loff_t i_size = i_size_read(inode);
				2620	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2621	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2622	int ret;
				2623
				2624	/* Is the page fully inside i_size? */
				2625	if (page->index < end_index)
				2626	goto out;
				2627
				2628	/* Is the page fully outside i_size? (truncate in progress) */
				2629	offset = i_size & (PAGE_CACHE_SIZE-1);
				2630	if (page->index >= end_index+1 \|\| !offset) {
				2631	/*
				2632	* The page may have dirty, unmapped buffers. For example,
				2633	* they may have been added in ext3_writepage(). Make them
				2634	* freeable here, so the page does not leak.
				2635	*/
				2636	#if 0
				2637	/* Not really sure about this - do we need this ? */
				2638	if (page->mapping->a_ops->invalidatepage)
				2639	page->mapping->a_ops->invalidatepage(page, offset);
				2640	#endif
				2641	unlock_page(page);
				2642	return 0; /* don't care */
				2643	}
				2644
				2645	/*
				2646	* The page straddles i_size. It must be zeroed out on each and every
				2647	* writepage invocation because it may be mmapped. "A file is mapped
				2648	* in multiples of the page size. For a file that is not a multiple of
				2649	* the page size, the remaining memory is zeroed when mapped, and
				2650	* writes to that region are not written out to the file."
				2651	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2652	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2653	out:
				2654	ret = mpage_writepage(page, get_block, wbc);
				2655	if (ret == -EAGAIN)
				2656	ret = __block_write_full_page(inode, page, get_block, wbc);
				2657	return ret;
				2658	}
				2659	EXPORT_SYMBOL(nobh_writepage);
				2660
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2661	int nobh_truncate_page(struct address_space *mapping,
				2662	loff_t from, get_block_t *get_block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2663	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2664	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2665	unsigned offset = from & (PAGE_CACHE_SIZE-1);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2666	unsigned blocksize;
				2667	sector_t iblock;
				2668	unsigned length, pos;
				2669	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2670	struct page *page;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2671	struct buffer_head map_bh;
				2672	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2673
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2674	blocksize = 1 << inode->i_blkbits;
				2675	length = offset & (blocksize - 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2676
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2677	/* Block boundary? Nothing to do */
				2678	if (!length)
				2679	return 0;
				2680
				2681	length = blocksize - length;
				2682	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2683
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2684	page = grab_cache_page(mapping, index);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2685	err = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2686	if (!page)
				2687	goto out;
				2688
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2689	if (page_has_buffers(page)) {
				2690	has_buffers:
				2691	unlock_page(page);
				2692	page_cache_release(page);
				2693	return block_truncate_page(mapping, from, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2694	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2695
				2696	/* Find the buffer that contains "offset" */
				2697	pos = blocksize;
				2698	while (offset >= pos) {
				2699	iblock++;
				2700	pos += blocksize;
				2701	}
				2702
				2703	err = get_block(inode, iblock, &map_bh, 0);
				2704	if (err)
				2705	goto unlock;
				2706	/* unmapped? It's a hole - nothing to do */
				2707	if (!buffer_mapped(&map_bh))
				2708	goto unlock;
				2709
				2710	/* Ok, it's mapped. Make sure it's up-to-date */
				2711	if (!PageUptodate(page)) {
				2712	err = mapping->a_ops->readpage(NULL, page);
				2713	if (err) {
				2714	page_cache_release(page);
				2715	goto out;
				2716	}
				2717	lock_page(page);
				2718	if (!PageUptodate(page)) {
				2719	err = -EIO;
				2720	goto unlock;
				2721	}
				2722	if (page_has_buffers(page))
				2723	goto has_buffers;
				2724	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2725	zero_user(page, offset, length);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2726	set_page_dirty(page);
				2727	err = 0;
				2728
				2729	unlock:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2730	unlock_page(page);
				2731	page_cache_release(page);
				2732	out:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2733	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2734	}
				2735	EXPORT_SYMBOL(nobh_truncate_page);
				2736
				2737	int block_truncate_page(struct address_space *mapping,
				2738	loff_t from, get_block_t *get_block)
				2739	{
				2740	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2741	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2742	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2743	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2744	unsigned length, pos;
				2745	struct inode *inode = mapping->host;
				2746	struct page *page;
				2747	struct buffer_head *bh;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2748	int err;
				2749
				2750	blocksize = 1 << inode->i_blkbits;
				2751	length = offset & (blocksize - 1);
				2752
				2753	/* Block boundary? Nothing to do */
				2754	if (!length)
				2755	return 0;
				2756
				2757	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2758	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2759
				2760	page = grab_cache_page(mapping, index);
				2761	err = -ENOMEM;
				2762	if (!page)
				2763	goto out;
				2764
				2765	if (!page_has_buffers(page))
				2766	create_empty_buffers(page, blocksize, 0);
				2767
				2768	/* Find the buffer that contains "offset" */
				2769	bh = page_buffers(page);
				2770	pos = blocksize;
				2771	while (offset >= pos) {
				2772	bh = bh->b_this_page;
				2773	iblock++;
				2774	pos += blocksize;
				2775	}
				2776
				2777	err = 0;
				2778	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2779	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2780	err = get_block(inode, iblock, bh, 0);
				2781	if (err)
				2782	goto unlock;
				2783	/* unmapped? It's a hole - nothing to do */
				2784	if (!buffer_mapped(bh))
				2785	goto unlock;
				2786	}
				2787
				2788	/* Ok, it's mapped. Make sure it's up-to-date */
				2789	if (PageUptodate(page))
				2790	set_buffer_uptodate(bh);
				2791
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2792	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2793	err = -EIO;
				2794	ll_rw_block(READ, 1, &bh);
				2795	wait_on_buffer(bh);
				2796	/* Uhhuh. Read error. Complain and punt. */
				2797	if (!buffer_uptodate(bh))
				2798	goto unlock;
				2799	}
				2800
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2801	zero_user(page, offset, length);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2802	mark_buffer_dirty(bh);
				2803	err = 0;
				2804
				2805	unlock:
				2806	unlock_page(page);
				2807	page_cache_release(page);
				2808	out:
				2809	return err;
				2810	}
				2811
				2812	/*
				2813	* The generic ->writepage function for buffer-backed address_spaces
				2814	*/
				2815	int block_write_full_page(struct page page, get_block_t get_block,
				2816	struct writeback_control *wbc)
				2817	{
				2818	struct inode * const inode = page->mapping->host;
				2819	loff_t i_size = i_size_read(inode);
				2820	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2821	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2822
				2823	/* Is the page fully inside i_size? */
				2824	if (page->index < end_index)
				2825	return __block_write_full_page(inode, page, get_block, wbc);
				2826
				2827	/* Is the page fully outside i_size? (truncate in progress) */
				2828	offset = i_size & (PAGE_CACHE_SIZE-1);
				2829	if (page->index >= end_index+1 \|\| !offset) {
				2830	/*
				2831	* The page may have dirty, unmapped buffers. For example,
				2832	* they may have been added in ext3_writepage(). Make them
				2833	* freeable here, so the page does not leak.
				2834	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2835	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2836	unlock_page(page);
				2837	return 0; /* don't care */
				2838	}
				2839
				2840	/*
				2841	* The page straddles i_size. It must be zeroed out on each and every
				2842	* writepage invokation because it may be mmapped. "A file is mapped
				2843	* in multiples of the page size. For a file that is not a multiple of
				2844	* the page size, the remaining memory is zeroed when mapped, and
				2845	* writes to that region are not written out to the file."
				2846	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2847	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2848	return __block_write_full_page(inode, page, get_block, wbc);
				2849	}
				2850
				2851	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2852	get_block_t *get_block)
				2853	{
				2854	struct buffer_head tmp;
				2855	struct inode *inode = mapping->host;
				2856	tmp.b_state = 0;
				2857	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2858	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2859	get_block(inode, block, &tmp, 0);
				2860	return tmp.b_blocknr;
				2861	}
				2862
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	2863	static void end_bio_bh_io_sync(struct bio *bio, int err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2864	{
				2865	struct buffer_head *bh = bio->bi_private;
				2866
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2867	if (err == -EOPNOTSUPP) {
				2868	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2869	set_bit(BH_Eopnotsupp, &bh->b_state);
				2870	}
				2871
				2872	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2873	bio_put(bio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2874	}
				2875
				2876	int submit_bh(int rw, struct buffer_head * bh)
				2877	{
				2878	struct bio *bio;
				2879	int ret = 0;
				2880
				2881	BUG_ON(!buffer_locked(bh));
				2882	BUG_ON(!buffer_mapped(bh));
				2883	BUG_ON(!bh->b_end_io);
				2884
				2885	if (buffer_ordered(bh) && (rw == WRITE))
				2886	rw = WRITE_BARRIER;
				2887
				2888	/*
				2889	* Only clear out a write error when rewriting, should this
				2890	* include WRITE_SYNC as well?
				2891	*/
				2892	if (test_set_buffer_req(bh) && (rw == WRITE \|\| rw == WRITE_BARRIER))
				2893	clear_buffer_write_io_error(bh);
				2894
				2895	/*
				2896	* from here on down, it's all bio -- do the initial mapping,
				2897	* submit_bio -> generic_make_request may further map this bio around
				2898	*/
				2899	bio = bio_alloc(GFP_NOIO, 1);
				2900
				2901	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2902	bio->bi_bdev = bh->b_bdev;
				2903	bio->bi_io_vec[0].bv_page = bh->b_page;
				2904	bio->bi_io_vec[0].bv_len = bh->b_size;
				2905	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2906
				2907	bio->bi_vcnt = 1;
				2908	bio->bi_idx = 0;
				2909	bio->bi_size = bh->b_size;
				2910
				2911	bio->bi_end_io = end_bio_bh_io_sync;
				2912	bio->bi_private = bh;
				2913
				2914	bio_get(bio);
				2915	submit_bio(rw, bio);
				2916
				2917	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2918	ret = -EOPNOTSUPP;
				2919
				2920	bio_put(bio);
				2921	return ret;
				2922	}
				2923
				2924	/**
				2925	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2926	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2927	* @nr: number of &struct buffer_heads in the array
				2928	* @bhs: array of pointers to &struct buffer_head
				2929	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2930	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2931	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2932	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2933	* are sent to disk. The fourth %READA option is described in the documentation
				2934	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2935	*
				2936	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2937	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				2938	* clean when doing a write request, and any buffer that appears to be
				2939	* up-to-date when doing read request. Further it marks as clean buffers that
				2940	* are processed for writing (the buffer cache won't assume that they are
				2941	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2942	*
				2943	* ll_rw_block sets b_end_io to simple completion handler that marks
				2944	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				2945	* any waiters.
				2946	*
				2947	* All of the buffers must be for the same device, and must also be a
				2948	* multiple of the current approved size for the device.
				2949	*/
				2950	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				2951	{
				2952	int i;
				2953
				2954	for (i = 0; i < nr; i++) {
				2955	struct buffer_head *bh = bhs[i];
				2956
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2957	if (rw == SWRITE)
				2958	lock_buffer(bh);
				2959	else if (test_set_buffer_locked(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2960	continue;
				2961
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2962	if (rw == WRITE \|\| rw == SWRITE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2963	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2964	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2965	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2966	submit_bh(WRITE, bh);
				2967	continue;
				2968	}
				2969	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2970	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2971	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2972	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2973	submit_bh(rw, bh);
				2974	continue;
				2975	}
				2976	}
				2977	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2978	}
				2979	}
				2980
				2981	/*
				2982	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2983	* and then start new I/O and then wait upon it. The caller must have a ref on
				2984	* the buffer_head.
				2985	*/
				2986	int sync_dirty_buffer(struct buffer_head *bh)
				2987	{
				2988	int ret = 0;
				2989
				2990	WARN_ON(atomic_read(&bh->b_count) < 1);
				2991	lock_buffer(bh);
				2992	if (test_clear_buffer_dirty(bh)) {
				2993	get_bh(bh);
				2994	bh->b_end_io = end_buffer_write_sync;
				2995	ret = submit_bh(WRITE, bh);
				2996	wait_on_buffer(bh);
				2997	if (buffer_eopnotsupp(bh)) {
				2998	clear_buffer_eopnotsupp(bh);
				2999	ret = -EOPNOTSUPP;
				3000	}
				3001	if (!ret && !buffer_uptodate(bh))
				3002	ret = -EIO;
				3003	} else {
				3004	unlock_buffer(bh);
				3005	}
				3006	return ret;
				3007	}
				3008
				3009	/*
				3010	* try_to_free_buffers() checks if all the buffers on this particular page
				3011	* are unused, and releases them if so.
				3012	*
				3013	* Exclusion against try_to_free_buffers may be obtained by either
				3014	* locking the page or by holding its mapping's private_lock.
				3015	*
				3016	* If the page is dirty but all the buffers are clean then we need to
				3017	* be sure to mark the page clean as well. This is because the page
				3018	* may be against a block device, and a later reattachment of buffers
				3019	* to a dirty page will set all buffers dirty. Which would corrupt
				3020	* filesystem data on the same device.
				3021	*
				3022	* The same applies to regular filesystem pages: if all the buffers are
				3023	* clean then we set the page clean and proceed. To do that, we require
				3024	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				3025	* private_lock.
				3026	*
				3027	* try_to_free_buffers() is non-blocking.
				3028	*/
				3029	static inline int buffer_busy(struct buffer_head *bh)
				3030	{
				3031	return atomic_read(&bh->b_count) \|
				3032	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				3033	}
				3034
				3035	static int
				3036	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				3037	{
				3038	struct buffer_head *head = page_buffers(page);
				3039	struct buffer_head *bh;
				3040
				3041	bh = head;
				3042	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	3043	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3044	set_bit(AS_EIO, &page->mapping->flags);
				3045	if (buffer_busy(bh))
				3046	goto failed;
				3047	bh = bh->b_this_page;
				3048	} while (bh != head);
				3049
				3050	do {
				3051	struct buffer_head *next = bh->b_this_page;
				3052
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	3053	if (bh->b_assoc_map)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3054	__remove_assoc_queue(bh);
				3055	bh = next;
				3056	} while (bh != head);
				3057	*buffers_to_free = head;
				3058	__clear_page_buffers(page);
				3059	return 1;
				3060	failed:
				3061	return 0;
				3062	}
				3063
				3064	int try_to_free_buffers(struct page *page)
				3065	{
				3066	struct address_space * const mapping = page->mapping;
				3067	struct buffer_head *buffers_to_free = NULL;
				3068	int ret = 0;
				3069
				3070	BUG_ON(!PageLocked(page));
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3071	if (PageWriteback(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3072	return 0;
				3073
				3074	if (mapping == NULL) { /* can this still happen? */
				3075	ret = drop_buffers(page, &buffers_to_free);
				3076	goto out;
				3077	}
				3078
				3079	spin_lock(&mapping->private_lock);
				3080	ret = drop_buffers(page, &buffers_to_free);
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3081
				3082	/*
				3083	* If the filesystem writes its buffers by hand (eg ext3)
				3084	* then we can have clean buffers against a dirty page. We
				3085	* clean the page here; otherwise the VM will never notice
				3086	* that the filesystem did any IO at all.
				3087	*
				3088	* Also, during truncate, discard_buffer will have marked all
				3089	* the page's buffers clean. We discover that here and clean
				3090	* the page also.
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3091	*
				3092	* private_lock must be held over this entire operation in order
				3093	* to synchronise against __set_page_dirty_buffers and prevent the
				3094	* dirty bit from being lost.
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3095	*/
				3096	if (ret)
				3097	cancel_dirty_page(page, PAGE_CACHE_SIZE);
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3098	spin_unlock(&mapping->private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3099	out:
				3100	if (buffers_to_free) {
				3101	struct buffer_head *bh = buffers_to_free;
				3102
				3103	do {
				3104	struct buffer_head *next = bh->b_this_page;
				3105	free_buffer_head(bh);
				3106	bh = next;
				3107	} while (bh != buffers_to_free);
				3108	}
				3109	return ret;
				3110	}
				3111	EXPORT_SYMBOL(try_to_free_buffers);
				3112
NeilBrown	3978d71	2006-03-26 01:37:17 -0800	[diff] [blame]	3113	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3114	{
				3115	struct address_space *mapping;
				3116
				3117	smp_mb();
				3118	mapping = page_mapping(page);
				3119	if (mapping)
				3120	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3121	}
				3122
				3123	/*
				3124	* There are no bdflush tunables left. But distributions are
				3125	* still running obsolete flush daemons, so we terminate them here.
				3126	*
				3127	* Use of bdflush() is deprecated and will be removed in a future kernel.
				3128	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				3129	*/
				3130	asmlinkage long sys_bdflush(int func, long data)
				3131	{
				3132	static int msg_count;
				3133
				3134	if (!capable(CAP_SYS_ADMIN))
				3135	return -EPERM;
				3136
				3137	if (msg_count < 5) {
				3138	msg_count++;
				3139	printk(KERN_INFO
				3140	"warning: process `%s' used the obsolete bdflush"
				3141	" system call\n", current->comm);
				3142	printk(KERN_INFO "Fix your initscripts?\n");
				3143	}
				3144
				3145	if (func == 1)
				3146	do_exit(0);
				3147	return 0;
				3148	}
				3149
				3150	/*
				3151	* Buffer-head allocation
				3152	*/
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	3153	static struct kmem_cache *bh_cachep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3154
				3155	/*
				3156	* Once the number of bh's in the machine exceeds this level, we start
				3157	* stripping them in writeback.
				3158	*/
				3159	static int max_buffer_heads;
				3160
				3161	int buffer_heads_over_limit;
				3162
				3163	struct bh_accounting {
				3164	int nr; /* Number of live bh's */
				3165	int ratelimit; /* Limit cacheline bouncing */
				3166	};
				3167
				3168	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3169
				3170	static void recalc_bh_state(void)
				3171	{
				3172	int i;
				3173	int tot = 0;
				3174
				3175	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				3176	return;
				3177	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3178	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3179	tot += per_cpu(bh_accounting, i).nr;
				3180	buffer_heads_over_limit = (tot > max_buffer_heads);
				3181	}
				3182
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	3183	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3184	{
Christoph Lameter	488514d	2008-04-28 02:12:05 -0700	[diff] [blame]	3185	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3186	if (ret) {
Christoph Lameter	a35afb8	2007-05-16 22:10:57 -0700	[diff] [blame]	3187	INIT_LIST_HEAD(&ret->b_assoc_buffers);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3188	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3189	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3190	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3191	}
				3192	return ret;
				3193	}
				3194	EXPORT_SYMBOL(alloc_buffer_head);
				3195
				3196	void free_buffer_head(struct buffer_head *bh)
				3197	{
				3198	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3199	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3200	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3201	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3202	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3203	}
				3204	EXPORT_SYMBOL(free_buffer_head);
				3205
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3206	static void buffer_exit_cpu(int cpu)
				3207	{
				3208	int i;
				3209	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3210
				3211	for (i = 0; i < BH_LRU_SIZE; i++) {
				3212	brelse(b->bhs[i]);
				3213	b->bhs[i] = NULL;
				3214	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3215	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				3216	per_cpu(bh_accounting, cpu).nr = 0;
				3217	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3218	}
				3219
				3220	static int buffer_cpu_notify(struct notifier_block *self,
				3221	unsigned long action, void *hcpu)
				3222	{
Rafael J. Wysocki	8bb7844	2007-05-09 02:35:10 -0700	[diff] [blame]	3223	if (action == CPU_DEAD \|\| action == CPU_DEAD_FROZEN)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3224	buffer_exit_cpu((unsigned long)hcpu);
				3225	return NOTIFY_OK;
				3226	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3227
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Returns 1 with the buffer unlocked when it is up-to-date, and 0 with
 * the buffer locked when it is not. The up-to-date state is checked
 * again once the lock is held.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	if (!buffer_uptodate(bh))
		return 0;

	/* Became up-to-date before we got the lock: no need to keep it. */
	unlock_buffer(bh);
	return 1;
}
				3245	EXPORT_SYMBOL(bh_uptodate_or_lock);
				3246
				3247	/**
Randy Dunlap	a6b9191	2008-03-19 17:01:00 -0700	[diff] [blame]	3248	* bh_submit_read - Submit a locked buffer for reading
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3249	* @bh: struct buffer_head
				3250	*
				3251	* Returns zero on success and -EIO on error.
				3252	*/
				3253	int bh_submit_read(struct buffer_head *bh)
				3254	{
				3255	BUG_ON(!buffer_locked(bh));
				3256
				3257	if (buffer_uptodate(bh)) {
				3258	unlock_buffer(bh);
				3259	return 0;
				3260	}
				3261
				3262	get_bh(bh);
				3263	bh->b_end_io = end_buffer_read_sync;
				3264	submit_bh(READ, bh);
				3265	wait_on_buffer(bh);
				3266	if (buffer_uptodate(bh))
				3267	return 0;
				3268	return -EIO;
				3269	}
				3270	EXPORT_SYMBOL(bh_submit_read);
				3271
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3272	static void
				3273	init_buffer_head(struct kmem_cache cachep, void data)
				3274	{
				3275	struct buffer_head *bh = data;
				3276
				3277	memset(bh, 0, sizeof(*bh));
				3278	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				3279	}
				3280
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3281	void __init buffer_init(void)
				3282	{
				3283	int nrpages;
				3284
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3285	bh_cachep = kmem_cache_create("buffer_head",
				3286	sizeof(struct buffer_head), 0,
				3287	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3288	SLAB_MEM_SPREAD),
				3289	init_buffer_head);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3290
				3291	/*
				3292	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3293	*/
				3294	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3295	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3296	hotcpu_notifier(buffer_cpu_notify, 0);
				3297	}
				3298
				3299	EXPORT_SYMBOL(__bforget);
				3300	EXPORT_SYMBOL(__brelse);
				3301	EXPORT_SYMBOL(__wait_on_buffer);
				3302	EXPORT_SYMBOL(block_commit_write);
				3303	EXPORT_SYMBOL(block_prepare_write);
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	3304	EXPORT_SYMBOL(block_page_mkwrite);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3305	EXPORT_SYMBOL(block_read_full_page);
				3306	EXPORT_SYMBOL(block_sync_page);
				3307	EXPORT_SYMBOL(block_truncate_page);
				3308	EXPORT_SYMBOL(block_write_full_page);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	3309	EXPORT_SYMBOL(cont_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3310	EXPORT_SYMBOL(end_buffer_read_sync);
				3311	EXPORT_SYMBOL(end_buffer_write_sync);
				3312	EXPORT_SYMBOL(file_fsync);
				3313	EXPORT_SYMBOL(fsync_bdev);
				3314	EXPORT_SYMBOL(generic_block_bmap);
				3315	EXPORT_SYMBOL(generic_commit_write);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	3316	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3317	EXPORT_SYMBOL(init_buffer);
				3318	EXPORT_SYMBOL(invalidate_bdev);
				3319	EXPORT_SYMBOL(ll_rw_block);
				3320	EXPORT_SYMBOL(mark_buffer_dirty);
				3321	EXPORT_SYMBOL(submit_bh);
				3322	EXPORT_SYMBOL(sync_dirty_buffer);
				3323	EXPORT_SYMBOL(unlock_buffer);