Blame - fs/buffer.c - fp2-dev/kernel/msm

blob: 826baf4f04bc738fdfea27f6652bc3c0bfe9969d [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	27	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	28	#include <linux/blkdev.h>
				29	#include <linux/file.h>
				30	#include <linux/quotaops.h>
				31	#include <linux/highmem.h>
				32	#include <linux/module.h>
				33	#include <linux/writeback.h>
				34	#include <linux/hash.h>
				35	#include <linux/suspend.h>
				36	#include <linux/buffer_head.h>
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	37	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	38	#include <linux/bio.h>
				39	#include <linux/notifier.h>
				40	#include <linux/cpu.h>
				41	#include <linux/bitops.h>
				42	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	43	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	44
				45	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	46
				47	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				48
				49	inline void
				50	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				51	{
				52	bh->b_end_io = handler;
				53	bh->b_private = private;
				54	}
				55
				56	static int sync_buffer(void *word)
				57	{
				58	struct block_device *bd;
				59	struct buffer_head *bh
				60	= container_of(word, struct buffer_head, b_state);
				61
				62	smp_mb();
				63	bd = bh->b_bdev;
				64	if (bd)
				65	blk_run_address_space(bd->bd_inode->i_mapping);
				66	io_schedule();
				67	return 0;
				68	}
				69
				70	void fastcall __lock_buffer(struct buffer_head *bh)
				71	{
				72	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				73	TASK_UNINTERRUPTIBLE);
				74	}
				75	EXPORT_SYMBOL(__lock_buffer);
				76
				77	void fastcall unlock_buffer(struct buffer_head *bh)
				78	{
Nick Piggin	72ed3d0	2007-02-10 01:46:22 -0800	[diff] [blame]	79	smp_mb__before_clear_bit();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	80	clear_buffer_locked(bh);
				81	smp_mb__after_clear_bit();
				82	wake_up_bit(&bh->b_state, BH_Lock);
				83	}
				84
				85	/*
				86	* Block until a buffer comes unlocked. This doesn't stop it
				87	* from becoming locked again - you have to lock it yourself
				88	* if you want to preserve its state.
				89	*/
				90	void __wait_on_buffer(struct buffer_head * bh)
				91	{
				92	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				93	}
				94
				95	static void
				96	__clear_page_buffers(struct page *page)
				97	{
				98	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	99	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	100	page_cache_release(page);
				101	}
				102
				103	static void buffer_io_error(struct buffer_head *bh)
				104	{
				105	char b[BDEVNAME_SIZE];
				106
				107	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				108	bdevname(bh->b_bdev, b),
				109	(unsigned long long)bh->b_blocknr);
				110	}
				111
				112	/*
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	113	* End-of-IO handler helper function which does not touch the bh after
				114	* unlocking it.
				115	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				116	* a race there is benign: unlock_buffer() only use the bh's address for
				117	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				118	* itself.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	119	*/
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	120	static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	121	{
				122	if (uptodate) {
				123	set_buffer_uptodate(bh);
				124	} else {
				125	/* This happens, due to failed READA attempts. */
				126	clear_buffer_uptodate(bh);
				127	}
				128	unlock_buffer(bh);
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	129	}
				130
				131	/*
				132	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				133	* unlock the buffer. This is what ll_rw_block uses too.
				134	*/
				135	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				136	{
				137	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	138	put_bh(bh);
				139	}
				140
				141	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				142	{
				143	char b[BDEVNAME_SIZE];
				144
				145	if (uptodate) {
				146	set_buffer_uptodate(bh);
				147	} else {
				148	if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
				149	buffer_io_error(bh);
				150	printk(KERN_WARNING "lost page write due to "
				151	"I/O error on %s\n",
				152	bdevname(bh->b_bdev, b));
				153	}
				154	set_buffer_write_io_error(bh);
				155	clear_buffer_uptodate(bh);
				156	}
				157	unlock_buffer(bh);
				158	put_bh(bh);
				159	}
				160
				161	/*
				162	* Write out and wait upon all the dirty data associated with a block
				163	* device via its mapping. Does not take the superblock lock.
				164	*/
				165	int sync_blockdev(struct block_device *bdev)
				166	{
				167	int ret = 0;
				168
OGAWA Hirofumi	28fd129	2006-01-08 01:02:14 -0800	[diff] [blame]	169	if (bdev)
				170	ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	171	return ret;
				172	}
				173	EXPORT_SYMBOL(sync_blockdev);
				174
/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 *
 * When get_super() finds a superblock the result of fsync_super() is
 * returned; otherwise the result of sync_blockdev().
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int res;

	if (!sb)
		return sync_blockdev(bdev);

	res = fsync_super(sb);
	drop_super(sb);
	return res;
}
				190
				191	/**
				192	* freeze_bdev -- lock a filesystem and force it into a consistent state
				193	* @bdev: blockdevice to lock
				194	*
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	195	* This takes the block device bd_mount_sem to make sure no new mounts
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	196	* happen on bdev until thaw_bdev() is called.
				197	* If a superblock is found on this device, we take the s_umount semaphore
				198	* on it to make sure nobody unmounts until the snapshot creation is done.
				199	*/
				200	struct super_block freeze_bdev(struct block_device bdev)
				201	{
				202	struct super_block *sb;
				203
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	204	down(&bdev->bd_mount_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	205	sb = get_super(bdev);
				206	if (sb && !(sb->s_flags & MS_RDONLY)) {
				207	sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	208	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	209
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	210	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	211
				212	sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	213	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	214
				215	sync_blockdev(sb->s_bdev);
				216
				217	if (sb->s_op->write_super_lockfs)
				218	sb->s_op->write_super_lockfs(sb);
				219	}
				220
				221	sync_blockdev(bdev);
				222	return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
				223	}
				224	EXPORT_SYMBOL(freeze_bdev);
				225
				226	/**
				227	* thaw_bdev -- unlock filesystem
				228	* @bdev: blockdevice to unlock
				229	* @sb: associated superblock
				230	*
				231	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
				232	*/
				233	void thaw_bdev(struct block_device bdev, struct super_block sb)
				234	{
				235	if (sb) {
				236	BUG_ON(sb->s_bdev != bdev);
				237
				238	if (sb->s_op->unlockfs)
				239	sb->s_op->unlockfs(sb);
				240	sb->s_frozen = SB_UNFROZEN;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	241	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	242	wake_up(&sb->s_wait_unfrozen);
				243	drop_super(sb);
				244	}
				245
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	246	up(&bdev->bd_mount_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	247	}
				248	EXPORT_SYMBOL(thaw_bdev);
				249
				250	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	251	* Various filesystems appear to want __find_get_block to be non-blocking.
				252	* But it's the page lock which protects the buffers. To get around this,
				253	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				254	* private_lock.
				255	*
				256	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				257	* may be quite high. This code could TryLock the page, and if that
				258	* succeeds, there is no need to take private_lock. (But if
				259	* private_lock is contended then so is mapping->tree_lock).
				260	*/
				261	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	262	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	263	{
				264	struct inode *bd_inode = bdev->bd_inode;
				265	struct address_space *bd_mapping = bd_inode->i_mapping;
				266	struct buffer_head *ret = NULL;
				267	pgoff_t index;
				268	struct buffer_head *bh;
				269	struct buffer_head *head;
				270	struct page *page;
				271	int all_mapped = 1;
				272
				273	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				274	page = find_get_page(bd_mapping, index);
				275	if (!page)
				276	goto out;
				277
				278	spin_lock(&bd_mapping->private_lock);
				279	if (!page_has_buffers(page))
				280	goto out_unlock;
				281	head = page_buffers(page);
				282	bh = head;
				283	do {
				284	if (bh->b_blocknr == block) {
				285	ret = bh;
				286	get_bh(bh);
				287	goto out_unlock;
				288	}
				289	if (!buffer_mapped(bh))
				290	all_mapped = 0;
				291	bh = bh->b_this_page;
				292	} while (bh != head);
				293
				294	/* we might be here because some of the buffers on this page are
				295	* not mapped. This is due to various races between
				296	* file io on the block device and getblk. It gets dealt with
				297	* elsewhere, don't buffer_error if we had some unmapped buffers
				298	*/
				299	if (all_mapped) {
				300	printk("__find_get_block_slow() failed. "
				301	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	302	(unsigned long long)block,
				303	(unsigned long long)bh->b_blocknr);
				304	printk("b_state=0x%08lx, b_size=%zu\n",
				305	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	306	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				307	}
				308	out_unlock:
				309	spin_unlock(&bd_mapping->private_lock);
				310	page_cache_release(page);
				311	out:
				312	return ret;
				313	}
				314
				315	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				316	of fs corruption is going on. Trashing dirty data always imply losing
				317	information that was supposed to be just stored on the physical layer
				318	by the user.
				319
				320	Thus invalidate_buffers in general usage is not allwowed to trash
				321	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				322	be preserved. These buffers are simply skipped.
				323
				324	We also skip buffers which are still in use. For example this can
				325	happen if a userspace program is reading the block device.
				326
				327	NOTE: In the case where the user removed a removable-media-disk even if
				328	there's still dirty data not synced on disk (due a bug in the device driver
				329	or due an error of the user), by not destroying the dirty buffers we could
				330	generate corruption also on the next media inserted, thus a parameter is
				331	necessary to handle this case in the most safe way possible (trying
				332	to not corrupt also the new disk inserted with the data belonging to
				333	the old now corrupted disk). Also for the ramdisk the natural thing
				334	to do in order to release the ramdisk memory is to destroy dirty buffers.
				335
				336	These are two special cases. Normal usage imply the device driver
				337	to issue a sync on the device (without waiting I/O completion) and
				338	then an invalidate_buffers call that doesn't trash dirty buffers.
				339
				340	For handling cache coherency with the blkdev pagecache the 'update' case
				341	is been introduced. It is needed to re-read from disk any pinned
				342	buffer. NOTE: re-reading from disk is destructive so we can do it only
				343	when we assume nobody is changing the buffercache under our I/O and when
				344	we think the disk contains more recent information than the buffercache.
				345	The update == 1 pass marks the buffers we need to update, the update == 2
				346	pass does the actual I/O. */
Peter Zijlstra	f98393a	2007-05-06 14:49:54 -0700	[diff] [blame]	347	void invalidate_bdev(struct block_device *bdev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	348	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	349	struct address_space *mapping = bdev->bd_inode->i_mapping;
				350
				351	if (mapping->nrpages == 0)
				352	return;
				353
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	354	invalidate_bh_lrus();
Andrew Morton	fc0ecff	2007-02-10 01:45:39 -0800	[diff] [blame]	355	invalidate_mapping_pages(mapping, 0, -1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	356	}
				357
				358	/*
				359	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				360	*/
				361	static void free_more_memory(void)
				362	{
				363	struct zone **zones;
				364	pg_data_t *pgdat;
				365
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	366	wakeup_pdflush(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	367	yield();
				368
KAMEZAWA Hiroyuki	ec936fc	2006-03-27 01:15:59 -0800	[diff] [blame]	369	for_each_online_pgdat(pgdat) {
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	370	zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	371	if (*zones)
Andy Whitcroft	5ad333e	2007-07-17 04:03:16 -0700	[diff] [blame]	372	try_to_free_pages(zones, 0, GFP_NOFS);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	373	}
				374	}
				375
				376	/*
				377	* I/O completion handler for block_read_full_page() - pages
				378	* which come unlocked at the end of I/O.
				379	*/
				380	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				381	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	382	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	383	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	384	struct buffer_head *tmp;
				385	struct page *page;
				386	int page_uptodate = 1;
				387
				388	BUG_ON(!buffer_async_read(bh));
				389
				390	page = bh->b_page;
				391	if (uptodate) {
				392	set_buffer_uptodate(bh);
				393	} else {
				394	clear_buffer_uptodate(bh);
				395	if (printk_ratelimit())
				396	buffer_io_error(bh);
				397	SetPageError(page);
				398	}
				399
				400	/*
				401	* Be _very_ careful from here on. Bad things can happen if
				402	* two buffer heads end IO at almost the same time and both
				403	* decide that the page is now completely done.
				404	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	405	first = page_buffers(page);
				406	local_irq_save(flags);
				407	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	408	clear_buffer_async_read(bh);
				409	unlock_buffer(bh);
				410	tmp = bh;
				411	do {
				412	if (!buffer_uptodate(tmp))
				413	page_uptodate = 0;
				414	if (buffer_async_read(tmp)) {
				415	BUG_ON(!buffer_locked(tmp));
				416	goto still_busy;
				417	}
				418	tmp = tmp->b_this_page;
				419	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	420	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				421	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	422
				423	/*
				424	* If none of the buffers had errors and they are all
				425	* uptodate then we can set the page uptodate.
				426	*/
				427	if (page_uptodate && !PageError(page))
				428	SetPageUptodate(page);
				429	unlock_page(page);
				430	return;
				431
				432	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	433	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				434	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	435	return;
				436	}
				437
				438	/*
				439	* Completion handler for block_write_full_page() - pages which are unlocked
				440	* during I/O, and which have PageWriteback cleared upon I/O completion.
				441	*/
Adrian Bunk	b6cd0b7	2006-06-27 02:53:54 -0700	[diff] [blame]	442	static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	443	{
				444	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	445	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	446	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	447	struct buffer_head *tmp;
				448	struct page *page;
				449
				450	BUG_ON(!buffer_async_write(bh));
				451
				452	page = bh->b_page;
				453	if (uptodate) {
				454	set_buffer_uptodate(bh);
				455	} else {
				456	if (printk_ratelimit()) {
				457	buffer_io_error(bh);
				458	printk(KERN_WARNING "lost page write due to "
				459	"I/O error on %s\n",
				460	bdevname(bh->b_bdev, b));
				461	}
				462	set_bit(AS_EIO, &page->mapping->flags);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	463	set_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	464	clear_buffer_uptodate(bh);
				465	SetPageError(page);
				466	}
				467
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	468	first = page_buffers(page);
				469	local_irq_save(flags);
				470	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				471
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	472	clear_buffer_async_write(bh);
				473	unlock_buffer(bh);
				474	tmp = bh->b_this_page;
				475	while (tmp != bh) {
				476	if (buffer_async_write(tmp)) {
				477	BUG_ON(!buffer_locked(tmp));
				478	goto still_busy;
				479	}
				480	tmp = tmp->b_this_page;
				481	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	482	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				483	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	484	end_page_writeback(page);
				485	return;
				486
				487	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	488	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				489	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	490	return;
				491	}
				492
				493	/*
				494	* If a page's buffers are under async readin (end_buffer_async_read
				495	* completion) then there is a possibility that another thread of
				496	* control could lock one of the buffers after it has completed
				497	* but while some of the other buffers have not completed. This
				498	* locked buffer would confuse end_buffer_async_read() into not unlocking
				499	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				500	* that this buffer is not under async I/O.
				501	*
				502	* The page comes unlocked when it has no locked buffer_async buffers
				503	* left.
				504	*
				505	* PageLocked prevents anyone starting new async I/O reads any of
				506	* the buffers.
				507	*
				508	* PageWriteback is used to prevent simultaneous writeout of the same
				509	* page.
				510	*
				511	* PageLocked prevents anyone from starting writeback of a page which is
				512	* under read I/O (PageWriteback is only ever set against a locked page).
				513	*/
				514	static void mark_buffer_async_read(struct buffer_head *bh)
				515	{
				516	bh->b_end_io = end_buffer_async_read;
				517	set_buffer_async_read(bh);
				518	}
				519
				520	void mark_buffer_async_write(struct buffer_head *bh)
				521	{
				522	bh->b_end_io = end_buffer_async_write;
				523	set_buffer_async_write(bh);
				524	}
				525	EXPORT_SYMBOL(mark_buffer_async_write);
				526
				527
				528	/*
				529	* fs/buffer.c contains helper functions for buffer-backed address space's
				530	* fsync functions. A common requirement for buffer-based filesystems is
				531	* that certain data from the backing blockdev needs to be written out for
				532	* a successful fsync(). For example, ext2 indirect blocks need to be
				533	* written back and waited upon before fsync() returns.
				534	*
				535	* The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
				536	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				537	* management of a list of dependent buffers at ->i_mapping->private_list.
				538	*
				539	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				540	* from their controlling inode's queue when they are being freed. But
				541	* try_to_free_buffers() will be operating against the blockdev mapping
				542	* at the time, not against the S_ISREG file which depends on those buffers.
				543	* So the locking for private_list is via the private_lock in the address_space
				544	* which backs the buffers. Which is different from the address_space
				545	* against which the buffers are listed. So for a particular address_space,
				546	* mapping->private_lock does not protect mapping->private_list! In fact,
				547	* mapping->private_list will always be protected by the backing blockdev's
				548	* ->private_lock.
				549	*
				550	* Which introduces a requirement: all buffers on an address_space's
				551	* ->private_list must be from the same address_space: the blockdev's.
				552	*
				553	* address_spaces which do not place buffers at ->private_list via these
				554	* utility functions are free to use private_lock and private_list for
				555	* whatever they want. The only requirement is that list_empty(private_list)
				556	* be true at clear_inode() time.
				557	*
				558	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				559	* filesystems should do that. invalidate_inode_buffers() should just go
				560	* BUG_ON(!list_empty).
				561	*
				562	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				563	* take an address_space, not an inode. And it should be called
				564	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				565	* queued up.
				566	*
				567	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				568	* list if it is already on a list. Because if the buffer is on a list,
				569	* it must already be on the right one. If not, the filesystem is being
				570	* silly. This will save a ton of locking. But first we have to ensure
				571	* that buffers are taken off the old inode's list when they are freed
				572	* (presumably in truncate). That requires careful auditing of all
				573	* filesystems (do it inside bforget()). It could also be done by bringing
				574	* b_inode back.
				575	*/
				576
				577	/*
				578	* The buffer's backing address_space's private_lock must be held
				579	*/
				580	static inline void __remove_assoc_queue(struct buffer_head *bh)
				581	{
				582	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	583	WARN_ON(!bh->b_assoc_map);
				584	if (buffer_write_io_error(bh))
				585	set_bit(AS_EIO, &bh->b_assoc_map->flags);
				586	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	587	}
				588
				589	int inode_has_buffers(struct inode *inode)
				590	{
				591	return !list_empty(&inode->i_data.private_list);
				592	}
				593
				594	/*
				595	* osync is designed to support O_SYNC io. It waits synchronously for
				596	* all already-submitted IO to complete, but does not queue any new
				597	* writes to the disk.
				598	*
				599	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				600	* you dirty the buffers, and then use osync_inode_buffers to wait for
				601	* completion. Any other dirty buffers which are not yet queued for
				602	* write will not be flushed to disk by the osync.
				603	*/
				604	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				605	{
				606	struct buffer_head *bh;
				607	struct list_head *p;
				608	int err = 0;
				609
				610	spin_lock(lock);
				611	repeat:
				612	list_for_each_prev(p, list) {
				613	bh = BH_ENTRY(p);
				614	if (buffer_locked(bh)) {
				615	get_bh(bh);
				616	spin_unlock(lock);
				617	wait_on_buffer(bh);
				618	if (!buffer_uptodate(bh))
				619	err = -EIO;
				620	brelse(bh);
				621	spin_lock(lock);
				622	goto repeat;
				623	}
				624	}
				625	spin_unlock(lock);
				626	return err;
				627	}
				628
				629	/**
				630	* sync_mapping_buffers - write out and wait upon a mapping's "associated"
				631	* buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	632	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	633	*
				634	* Starts I/O against the buffers at mapping->private_list, and waits upon
				635	* that I/O.
				636	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	637	* Basically, this is a convenience function for fsync().
				638	* @mapping is a file or directory which needs those buffers to be written for
				639	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	640	*/
				641	int sync_mapping_buffers(struct address_space *mapping)
				642	{
				643	struct address_space *buffer_mapping = mapping->assoc_mapping;
				644
				645	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				646	return 0;
				647
				648	return fsync_buffers_list(&buffer_mapping->private_lock,
				649	&mapping->private_list);
				650	}
				651	EXPORT_SYMBOL(sync_mapping_buffers);
				652
				653	/*
				654	* Called when we've recently written block `bblock', and it is known that
				655	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				656	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				657	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				658	*/
				659	void write_boundary_block(struct block_device *bdev,
				660	sector_t bblock, unsigned blocksize)
				661	{
				662	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				663	if (bh) {
				664	if (buffer_dirty(bh))
				665	ll_rw_block(WRITE, 1, &bh);
				666	put_bh(bh);
				667	}
				668	}
				669
				670	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				671	{
				672	struct address_space *mapping = inode->i_mapping;
				673	struct address_space *buffer_mapping = bh->b_page->mapping;
				674
				675	mark_buffer_dirty(bh);
				676	if (!mapping->assoc_mapping) {
				677	mapping->assoc_mapping = buffer_mapping;
				678	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	679	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	680	}
				681	if (list_empty(&bh->b_assoc_buffers)) {
				682	spin_lock(&buffer_mapping->private_lock);
				683	list_move_tail(&bh->b_assoc_buffers,
				684	&mapping->private_list);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	685	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	686	spin_unlock(&buffer_mapping->private_lock);
				687	}
				688	}
				689	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				690
				691	/*
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	692	* Mark the page dirty, and set it dirty in the radix tree, and mark the inode
				693	* dirty.
				694	*
				695	* If warn is true, then emit a warning if the page is not uptodate and has
				696	* not been truncated.
				697	*/
				698	static int __set_page_dirty(struct page *page,
				699	struct address_space *mapping, int warn)
				700	{
				701	if (unlikely(!mapping))
				702	return !TestSetPageDirty(page);
				703
				704	if (TestSetPageDirty(page))
				705	return 0;
				706
				707	write_lock_irq(&mapping->tree_lock);
				708	if (page->mapping) { /* Race with truncate? */
				709	WARN_ON_ONCE(warn && !PageUptodate(page));
				710
				711	if (mapping_cap_account_dirty(mapping)) {
				712	__inc_zone_page_state(page, NR_FILE_DIRTY);
Peter Zijlstra	c9e51e4	2007-10-16 23:25:47 -0700	[diff] [blame]	713	__inc_bdi_stat(mapping->backing_dev_info,
				714	BDI_RECLAIMABLE);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	715	task_io_account_write(PAGE_CACHE_SIZE);
				716	}
				717	radix_tree_tag_set(&mapping->page_tree,
				718	page_index(page), PAGECACHE_TAG_DIRTY);
				719	}
				720	write_unlock_irq(&mapping->tree_lock);
				721	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
				722
				723	return 1;
				724	}
				725
				726	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	727	* Add a page to the dirty page list.
				728	*
				729	* It is a sad fact of life that this function is called from several places
				730	* deeply under spinlocking. It may not sleep.
				731	*
				732	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				733	* dirty-state coherency between the page and the buffers. It the page does
				734	* not have buffers then when they are later attached they will all be set
				735	* dirty.
				736	*
				737	* The buffers are dirtied before the page is dirtied. There's a small race
				738	* window in which a writepage caller may see the page cleanness but not the
				739	* buffer dirtiness. That's fine. If this code were to set the page dirty
				740	* before the buffers, a concurrent writepage caller could clear the page dirty
				741	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				742	* page on the dirty page list.
				743	*
				744	* We use private_lock to lock against try_to_free_buffers while using the
				745	* page's buffer list. Also use this to protect against clean buffers being
				746	* added to the page after it was set dirty.
				747	*
				748	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				749	* address_space though.
				750	*/
				751	int __set_page_dirty_buffers(struct page *page)
				752	{
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	753	struct address_space *mapping = page_mapping(page);
Nick Piggin	ebf7a22	2006-10-10 04:36:54 +0200	[diff] [blame]	754
				755	if (unlikely(!mapping))
				756	return !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	757
				758	spin_lock(&mapping->private_lock);
				759	if (page_has_buffers(page)) {
				760	struct buffer_head *head = page_buffers(page);
				761	struct buffer_head *bh = head;
				762
				763	do {
				764	set_buffer_dirty(bh);
				765	bh = bh->b_this_page;
				766	} while (bh != head);
				767	}
				768	spin_unlock(&mapping->private_lock);
				769
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	770	return __set_page_dirty(page, mapping, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	771	}
				772	EXPORT_SYMBOL(__set_page_dirty_buffers);
				773
				774	/*
				775	* Write out and wait upon a list of buffers.
				776	*
				777	* We have conflicting pressures: we want to make sure that all
				778	* initially dirty buffers get waited on, but that any subsequently
				779	* dirtied buffers don't. After all, we don't want fsync to last
				780	* forever if somebody is actively writing to the file.
				781	*
				782	* Do this in two main stages: first we copy dirty buffers to a
				783	* temporary inode list, queueing the writes as we go. Then we clean
				784	* up, waiting for those writes to complete.
				785	*
				786	* During this second stage, any subsequent updates to the file may end
				787	* up refiling the buffer on the original inode's dirty list again, so
				788	* there is a chance we will end up with a buffer queued for write but
				789	* not yet completed on that list. So, as a final cleanup we go through
				790	* the osync code to catch these locked, dirty buffers without requeuing
				791	* any newly dirty buffers for write.
				792	*/
				793	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				794	{
				795	struct buffer_head *bh;
				796	struct list_head tmp;
				797	int err = 0, err2;
				798
				799	INIT_LIST_HEAD(&tmp);
				800
				801	spin_lock(lock);
				802	while (!list_empty(list)) {
				803	bh = BH_ENTRY(list->next);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	804	__remove_assoc_queue(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	805	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				806	list_add(&bh->b_assoc_buffers, &tmp);
				807	if (buffer_dirty(bh)) {
				808	get_bh(bh);
				809	spin_unlock(lock);
				810	/*
				811	* Ensure any pending I/O completes so that
				812	* ll_rw_block() actually writes the current
				813	* contents - it is a noop if I/O is still in
				814	* flight on potentially older contents.
				815	*/
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	816	ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	817	brelse(bh);
				818	spin_lock(lock);
				819	}
				820	}
				821	}
				822
				823	while (!list_empty(&tmp)) {
				824	bh = BH_ENTRY(tmp.prev);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	825	list_del_init(&bh->b_assoc_buffers);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	826	get_bh(bh);
				827	spin_unlock(lock);
				828	wait_on_buffer(bh);
				829	if (!buffer_uptodate(bh))
				830	err = -EIO;
				831	brelse(bh);
				832	spin_lock(lock);
				833	}
				834
				835	spin_unlock(lock);
				836	err2 = osync_buffers_list(lock, list);
				837	if (err)
				838	return err;
				839	else
				840	return err2;
				841	}
				842
				843	/*
				844	* Invalidate any and all dirty buffers on a given inode. We are
				845	* probably unmounting the fs, but that doesn't mean we have already
				846	* done a sync(). Just drop the buffers from the inode list.
				847	*
				848	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				849	* assumes that all the buffers are against the blockdev. Not true
				850	* for reiserfs.
				851	*/
				852	void invalidate_inode_buffers(struct inode *inode)
				853	{
				854	if (inode_has_buffers(inode)) {
				855	struct address_space *mapping = &inode->i_data;
				856	struct list_head *list = &mapping->private_list;
				857	struct address_space *buffer_mapping = mapping->assoc_mapping;
				858
				859	spin_lock(&buffer_mapping->private_lock);
				860	while (!list_empty(list))
				861	__remove_assoc_queue(BH_ENTRY(list->next));
				862	spin_unlock(&buffer_mapping->private_lock);
				863	}
				864	}
				865
				866	/*
				867	* Remove any clean buffers from the inode's buffer list. This is called
				868	* when we're trying to free the inode itself. Those buffers can pin it.
				869	*
				870	* Returns true if all buffers were removed.
				871	*/
				872	int remove_inode_buffers(struct inode *inode)
				873	{
				874	int ret = 1;
				875
				876	if (inode_has_buffers(inode)) {
				877	struct address_space *mapping = &inode->i_data;
				878	struct list_head *list = &mapping->private_list;
				879	struct address_space *buffer_mapping = mapping->assoc_mapping;
				880
				881	spin_lock(&buffer_mapping->private_lock);
				882	while (!list_empty(list)) {
				883	struct buffer_head *bh = BH_ENTRY(list->next);
				884	if (buffer_dirty(bh)) {
				885	ret = 0;
				886	break;
				887	}
				888	__remove_assoc_queue(bh);
				889	}
				890	spin_unlock(&buffer_mapping->private_lock);
				891	}
				892	return ret;
				893	}
				894
				895	/*
				896	* Create the appropriate buffers when given a page for data area and
				897	* the size of each buffer.. Use the bh->b_this_page linked list to
				898	* follow the buffers created. Return NULL if unable to create more
				899	* buffers.
				900	*
				901	* The retry flag is used to differentiate async IO (paging, swapping)
				902	* which may not fail from ordinary buffer allocations.
				903	*/
				904	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				905	int retry)
				906	{
				907	struct buffer_head bh, head;
				908	long offset;
				909
				910	try_again:
				911	head = NULL;
				912	offset = PAGE_SIZE;
				913	while ((offset -= size) >= 0) {
				914	bh = alloc_buffer_head(GFP_NOFS);
				915	if (!bh)
				916	goto no_grow;
				917
				918	bh->b_bdev = NULL;
				919	bh->b_this_page = head;
				920	bh->b_blocknr = -1;
				921	head = bh;
				922
				923	bh->b_state = 0;
				924	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	925	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	926	bh->b_size = size;
				927
				928	/* Link the buffer to its page */
				929	set_bh_page(bh, page, offset);
				930
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	931	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	932	}
				933	return head;
				934	/*
				935	* In case anything failed, we just free everything we got.
				936	*/
				937	no_grow:
				938	if (head) {
				939	do {
				940	bh = head;
				941	head = head->b_this_page;
				942	free_buffer_head(bh);
				943	} while (head);
				944	}
				945
				946	/*
				947	* Return failure for non-async IO requests. Async IO requests
				948	* are not allowed to fail, so we have to wait until buffer heads
				949	* become available. But we don't want tasks sleeping with
				950	* partially complete buffers, so all were released above.
				951	*/
				952	if (!retry)
				953	return NULL;
				954
				955	/* We're _really_ low on memory. Now we just
				956	* wait for old buffer heads to become free due to
				957	* finishing IO. Since this is an async request and
				958	* the reserve list is empty, we're sure there are
				959	* async buffer heads in use.
				960	*/
				961	free_more_memory();
				962	goto try_again;
				963	}
				964	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				965
				966	static inline void
				967	link_dev_buffers(struct page page, struct buffer_head head)
				968	{
				969	struct buffer_head bh, tail;
				970
				971	bh = head;
				972	do {
				973	tail = bh;
				974	bh = bh->b_this_page;
				975	} while (bh);
				976	tail->b_this_page = head;
				977	attach_page_buffers(page, head);
				978	}
				979
				980	/*
				981	* Initialise the state of a blockdev page's buffers.
				982	*/
				983	static void
				984	init_page_buffers(struct page page, struct block_device bdev,
				985	sector_t block, int size)
				986	{
				987	struct buffer_head *head = page_buffers(page);
				988	struct buffer_head *bh = head;
				989	int uptodate = PageUptodate(page);
				990
				991	do {
				992	if (!buffer_mapped(bh)) {
				993	init_buffer(bh, NULL, NULL);
				994	bh->b_bdev = bdev;
				995	bh->b_blocknr = block;
				996	if (uptodate)
				997	set_buffer_uptodate(bh);
				998	set_buffer_mapped(bh);
				999	}
				1000	block++;
				1001	bh = bh->b_this_page;
				1002	} while (bh != head);
				1003	}
				1004
				1005	/*
				1006	* Create the page-cache page that contains the requested block.
				1007	*
				1008	* This is used purely for blockdev mappings.
				1009	*/
				1010	static struct page *
				1011	grow_dev_page(struct block_device *bdev, sector_t block,
				1012	pgoff_t index, int size)
				1013	{
				1014	struct inode *inode = bdev->bd_inode;
				1015	struct page *page;
				1016	struct buffer_head *bh;
				1017
Christoph Lameter	ea12589	2007-05-16 22:11:21 -0700	[diff] [blame]	1018	page = find_or_create_page(inode->i_mapping, index,
Mel Gorman	769848c	2007-07-17 04:03:05 -0700	[diff] [blame]	1019	(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)\|__GFP_MOVABLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1020	if (!page)
				1021	return NULL;
				1022
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1023	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1024
				1025	if (page_has_buffers(page)) {
				1026	bh = page_buffers(page);
				1027	if (bh->b_size == size) {
				1028	init_page_buffers(page, bdev, block, size);
				1029	return page;
				1030	}
				1031	if (!try_to_free_buffers(page))
				1032	goto failed;
				1033	}
				1034
				1035	/*
				1036	* Allocate some buffers for this page
				1037	*/
				1038	bh = alloc_page_buffers(page, size, 0);
				1039	if (!bh)
				1040	goto failed;
				1041
				1042	/*
				1043	* Link the page to the buffers and initialise them. Take the
				1044	* lock to be atomic wrt __find_get_block(), which does not
				1045	* run under the page lock.
				1046	*/
				1047	spin_lock(&inode->i_mapping->private_lock);
				1048	link_dev_buffers(page, bh);
				1049	init_page_buffers(page, bdev, block, size);
				1050	spin_unlock(&inode->i_mapping->private_lock);
				1051	return page;
				1052
				1053	failed:
				1054	BUG();
				1055	unlock_page(page);
				1056	page_cache_release(page);
				1057	return NULL;
				1058	}
				1059
				1060	/*
				1061	* Create buffers for the specified block device block's page. If
				1062	* that page was dirty, the buffers are set dirty also.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1063	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1064	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1065	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1066	{
				1067	struct page *page;
				1068	pgoff_t index;
				1069	int sizebits;
				1070
				1071	sizebits = -1;
				1072	do {
				1073	sizebits++;
				1074	} while ((size << sizebits) < PAGE_SIZE);
				1075
				1076	index = block >> sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1077
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1078	/*
				1079	* Check for a block which wants to lie outside our maximum possible
				1080	* pagecache index. (this comparison is done using sector_t types).
				1081	*/
				1082	if (unlikely(index != block >> sizebits)) {
				1083	char b[BDEVNAME_SIZE];
				1084
				1085	printk(KERN_ERR "%s: requested out-of-range block %llu for "
				1086	"device %s\n",
				1087	__FUNCTION__, (unsigned long long)block,
				1088	bdevname(bdev, b));
				1089	return -EIO;
				1090	}
				1091	block = index << sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1092	/* Create a page with the proper size buffers.. */
				1093	page = grow_dev_page(bdev, block, index, size);
				1094	if (!page)
				1095	return 0;
				1096	unlock_page(page);
				1097	page_cache_release(page);
				1098	return 1;
				1099	}
				1100
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1101	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1102	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1103	{
				1104	/* Size must be multiple of hard sectorsize */
				1105	if (unlikely(size & (bdev_hardsect_size(bdev)-1) \|\|
				1106	(size < 512 \|\| size > PAGE_SIZE))) {
				1107	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1108	size);
				1109	printk(KERN_ERR "hardsect size: %d\n",
				1110	bdev_hardsect_size(bdev));
				1111
				1112	dump_stack();
				1113	return NULL;
				1114	}
				1115
				1116	for (;;) {
				1117	struct buffer_head * bh;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1118	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1119
				1120	bh = __find_get_block(bdev, block, size);
				1121	if (bh)
				1122	return bh;
				1123
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1124	ret = grow_buffers(bdev, block, size);
				1125	if (ret < 0)
				1126	return NULL;
				1127	if (ret == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1128	free_more_memory();
				1129	}
				1130	}
				1131
				1132	/*
				1133	* The relationship between dirty buffers and dirty pages:
				1134	*
				1135	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1136	* the page is tagged dirty in its radix tree.
				1137	*
				1138	* At all times, the dirtiness of the buffers represents the dirtiness of
				1139	* subsections of the page. If the page has buffers, the page dirty bit is
				1140	* merely a hint about the true dirty state.
				1141	*
				1142	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1143	* (if the page has buffers).
				1144	*
				1145	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1146	* buffers are not.
				1147	*
				1148	* Also. When blockdev buffers are explicitly read with bread(), they
				1149	* individually become uptodate. But their backing page remains not
				1150	* uptodate - even if all of its buffers are uptodate. A subsequent
				1151	* block_read_full_page() against that page will discover all the uptodate
				1152	* buffers, will set the page uptodate and will perform no I/O.
				1153	*/
				1154
				1155	/**
				1156	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1157	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1158	*
				1159	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1160	* backing page dirty, then tag the page as dirty in its address_space's radix
				1161	* tree and then attach the address_space's inode to its superblock's dirty
				1162	* inode list.
				1163	*
				1164	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1165	* mapping->tree_lock and the global inode_lock.
				1166	*/
				1167	void fastcall mark_buffer_dirty(struct buffer_head *bh)
				1168	{
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	1169	WARN_ON_ONCE(!buffer_uptodate(bh));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1170	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	1171	__set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1172	}
				1173
				1174	/*
				1175	* Decrement a buffer_head's reference count. If all buffers against a page
				1176	* have zero reference count, are clean and unlocked, and if the page is clean
				1177	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1178	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1179	* a page but it ends up not being freed, and buffers may later be reattached).
				1180	*/
				1181	void __brelse(struct buffer_head * buf)
				1182	{
				1183	if (atomic_read(&buf->b_count)) {
				1184	put_bh(buf);
				1185	return;
				1186	}
				1187	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
				1188	WARN_ON(1);
				1189	}
				1190
				1191	/*
				1192	* bforget() is like brelse(), except it discards any
				1193	* potentially dirty data.
				1194	*/
				1195	void __bforget(struct buffer_head *bh)
				1196	{
				1197	clear_buffer_dirty(bh);
				1198	if (!list_empty(&bh->b_assoc_buffers)) {
				1199	struct address_space *buffer_mapping = bh->b_page->mapping;
				1200
				1201	spin_lock(&buffer_mapping->private_lock);
				1202	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	1203	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1204	spin_unlock(&buffer_mapping->private_lock);
				1205	}
				1206	__brelse(bh);
				1207	}
				1208
				1209	static struct buffer_head __bread_slow(struct buffer_head bh)
				1210	{
				1211	lock_buffer(bh);
				1212	if (buffer_uptodate(bh)) {
				1213	unlock_buffer(bh);
				1214	return bh;
				1215	} else {
				1216	get_bh(bh);
				1217	bh->b_end_io = end_buffer_read_sync;
				1218	submit_bh(READ, bh);
				1219	wait_on_buffer(bh);
				1220	if (buffer_uptodate(bh))
				1221	return bh;
				1222	}
				1223	brelse(bh);
				1224	return NULL;
				1225	}
				1226
				1227	/*
				1228	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1229	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1230	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1231	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1232	* CPU's LRUs at the same time.
				1233	*
				1234	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1235	* sb_find_get_block().
				1236	*
				1237	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1238	* a local interrupt disable for that.
				1239	*/
				1240
				1241	#define BH_LRU_SIZE 8
				1242
				1243	struct bh_lru {
				1244	struct buffer_head *bhs[BH_LRU_SIZE];
				1245	};
				1246
				1247	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1248
				1249	#ifdef CONFIG_SMP
				1250	#define bh_lru_lock() local_irq_disable()
				1251	#define bh_lru_unlock() local_irq_enable()
				1252	#else
				1253	#define bh_lru_lock() preempt_disable()
				1254	#define bh_lru_unlock() preempt_enable()
				1255	#endif
				1256
				1257	static inline void check_irqs_on(void)
				1258	{
				1259	#ifdef irqs_disabled
				1260	BUG_ON(irqs_disabled());
				1261	#endif
				1262	}
				1263
				1264	/*
				1265	* The LRU management algorithm is dopey-but-simple. Sorry.
				1266	*/
				1267	static void bh_lru_install(struct buffer_head *bh)
				1268	{
				1269	struct buffer_head *evictee = NULL;
				1270	struct bh_lru *lru;
				1271
				1272	check_irqs_on();
				1273	bh_lru_lock();
				1274	lru = &__get_cpu_var(bh_lrus);
				1275	if (lru->bhs[0] != bh) {
				1276	struct buffer_head *bhs[BH_LRU_SIZE];
				1277	int in;
				1278	int out = 0;
				1279
				1280	get_bh(bh);
				1281	bhs[out++] = bh;
				1282	for (in = 0; in < BH_LRU_SIZE; in++) {
				1283	struct buffer_head *bh2 = lru->bhs[in];
				1284
				1285	if (bh2 == bh) {
				1286	__brelse(bh2);
				1287	} else {
				1288	if (out >= BH_LRU_SIZE) {
				1289	BUG_ON(evictee != NULL);
				1290	evictee = bh2;
				1291	} else {
				1292	bhs[out++] = bh2;
				1293	}
				1294	}
				1295	}
				1296	while (out < BH_LRU_SIZE)
				1297	bhs[out++] = NULL;
				1298	memcpy(lru->bhs, bhs, sizeof(bhs));
				1299	}
				1300	bh_lru_unlock();
				1301
				1302	if (evictee)
				1303	__brelse(evictee);
				1304	}
				1305
				1306	/*
				1307	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1308	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1309	static struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1310	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1311	{
				1312	struct buffer_head *ret = NULL;
				1313	struct bh_lru *lru;
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1314	unsigned int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1315
				1316	check_irqs_on();
				1317	bh_lru_lock();
				1318	lru = &__get_cpu_var(bh_lrus);
				1319	for (i = 0; i < BH_LRU_SIZE; i++) {
				1320	struct buffer_head *bh = lru->bhs[i];
				1321
				1322	if (bh && bh->b_bdev == bdev &&
				1323	bh->b_blocknr == block && bh->b_size == size) {
				1324	if (i) {
				1325	while (i) {
				1326	lru->bhs[i] = lru->bhs[i - 1];
				1327	i--;
				1328	}
				1329	lru->bhs[0] = bh;
				1330	}
				1331	get_bh(bh);
				1332	ret = bh;
				1333	break;
				1334	}
				1335	}
				1336	bh_lru_unlock();
				1337	return ret;
				1338	}
				1339
				1340	/*
				1341	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1342	* it in the LRU and mark it as accessed. If it is not present then return
				1343	* NULL
				1344	*/
				1345	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1346	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1347	{
				1348	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1349
				1350	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1351	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1352	if (bh)
				1353	bh_lru_install(bh);
				1354	}
				1355	if (bh)
				1356	touch_buffer(bh);
				1357	return bh;
				1358	}
				1359	EXPORT_SYMBOL(__find_get_block);
				1360
				1361	/*
				1362	* __getblk will locate (and, if necessary, create) the buffer_head
				1363	* which corresponds to the passed block_device, block and size. The
				1364	* returned buffer has its reference count incremented.
				1365	*
				1366	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1367	* illegal block number, __getblk() will happily return a buffer_head
				1368	* which represents the non-existent block. Very weird.
				1369	*
				1370	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1371	* attempt is failing. FIXME, perhaps?
				1372	*/
				1373	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1374	__getblk(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1375	{
				1376	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1377
				1378	might_sleep();
				1379	if (bh == NULL)
				1380	bh = __getblk_slow(bdev, block, size);
				1381	return bh;
				1382	}
				1383	EXPORT_SYMBOL(__getblk);
				1384
				1385	/*
				1386	* Do async read-ahead on a buffer..
				1387	*/
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1388	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1389	{
				1390	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1391	if (likely(bh)) {
				1392	ll_rw_block(READA, 1, &bh);
				1393	brelse(bh);
				1394	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1395	}
				1396	EXPORT_SYMBOL(__breadahead);
				1397
				1398	/**
				1399	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1400	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1401	* @block: number of block
				1402	* @size: size (in bytes) to read
				1403	*
				1404	* Reads a specified block, and returns buffer head that contains it.
				1405	* It returns NULL if the block was unreadable.
				1406	*/
				1407	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1408	__bread(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1409	{
				1410	struct buffer_head *bh = __getblk(bdev, block, size);
				1411
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1412	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1413	bh = __bread_slow(bh);
				1414	return bh;
				1415	}
				1416	EXPORT_SYMBOL(__bread);
				1417
				1418	/*
				1419	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1420	* This doesn't race because it runs in each cpu either in irq
				1421	* or with preempt disabled.
				1422	*/
				1423	static void invalidate_bh_lru(void *arg)
				1424	{
				1425	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1426	int i;
				1427
				1428	for (i = 0; i < BH_LRU_SIZE; i++) {
				1429	brelse(b->bhs[i]);
				1430	b->bhs[i] = NULL;
				1431	}
				1432	put_cpu_var(bh_lrus);
				1433	}
				1434
Peter Zijlstra	f9a1439	2007-05-06 14:49:55 -0700	[diff] [blame]	1435	void invalidate_bh_lrus(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1436	{
				1437	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
				1438	}
				1439
				1440	void set_bh_page(struct buffer_head *bh,
				1441	struct page *page, unsigned long offset)
				1442	{
				1443	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1444	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1445	if (PageHighMem(page))
				1446	/*
				1447	* This catches illegal uses and preserves the offset:
				1448	*/
				1449	bh->b_data = (char *)(0 + offset);
				1450	else
				1451	bh->b_data = page_address(page) + offset;
				1452	}
				1453	EXPORT_SYMBOL(set_bh_page);
				1454
				1455	/*
				1456	* Called when truncating a buffer on a page completely.
				1457	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1458	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1459	{
				1460	lock_buffer(bh);
				1461	clear_buffer_dirty(bh);
				1462	bh->b_bdev = NULL;
				1463	clear_buffer_mapped(bh);
				1464	clear_buffer_req(bh);
				1465	clear_buffer_new(bh);
				1466	clear_buffer_delay(bh);
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1467	clear_buffer_unwritten(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1468	unlock_buffer(bh);
				1469	}
				1470
				1471	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1472	* block_invalidatepage - invalidate part or all of a buffer-backed page
				1473	*
				1474	* @page: the page which is affected
				1475	* @offset: the index of the truncation point
				1476	*
				1477	* block_invalidatepage() is called when all or part of the page has become
				1478	* invalidated by a truncate operation.
				1479	*
				1480	* block_invalidatepage() does not have to release all buffers, but it must
				1481	* ensure that no dirty buffer is left outside @offset and that no I/O
				1482	* is underway against any of the blocks which are outside the truncation
				1483	* point. Because the caller is about to free (and possibly reuse) those
				1484	* blocks on-disk.
				1485	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1486	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1487	{
				1488	struct buffer_head head, bh, *next;
				1489	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1490
				1491	BUG_ON(!PageLocked(page));
				1492	if (!page_has_buffers(page))
				1493	goto out;
				1494
				1495	head = page_buffers(page);
				1496	bh = head;
				1497	do {
				1498	unsigned int next_off = curr_off + bh->b_size;
				1499	next = bh->b_this_page;
				1500
				1501	/*
				1502	* is this block fully invalidated?
				1503	*/
				1504	if (offset <= curr_off)
				1505	discard_buffer(bh);
				1506	curr_off = next_off;
				1507	bh = next;
				1508	} while (bh != head);
				1509
				1510	/*
				1511	* We release buffers only if the entire page is being invalidated.
				1512	* The get_block cached value has been unconditionally invalidated,
				1513	* so real IO is not possible anymore.
				1514	*/
				1515	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1516	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1517	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1518	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1519	}
				1520	EXPORT_SYMBOL(block_invalidatepage);
				1521
				1522	/*
				1523	* We attach and possibly dirty the buffers atomically wrt
				1524	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1525	* is already excluded via the page lock.
				1526	*/
				1527	void create_empty_buffers(struct page *page,
				1528	unsigned long blocksize, unsigned long b_state)
				1529	{
				1530	struct buffer_head bh, head, *tail;
				1531
				1532	head = alloc_page_buffers(page, blocksize, 1);
				1533	bh = head;
				1534	do {
				1535	bh->b_state \|= b_state;
				1536	tail = bh;
				1537	bh = bh->b_this_page;
				1538	} while (bh);
				1539	tail->b_this_page = head;
				1540
				1541	spin_lock(&page->mapping->private_lock);
				1542	if (PageUptodate(page) \|\| PageDirty(page)) {
				1543	bh = head;
				1544	do {
				1545	if (PageDirty(page))
				1546	set_buffer_dirty(bh);
				1547	if (PageUptodate(page))
				1548	set_buffer_uptodate(bh);
				1549	bh = bh->b_this_page;
				1550	} while (bh != head);
				1551	}
				1552	attach_page_buffers(page, head);
				1553	spin_unlock(&page->mapping->private_lock);
				1554	}
				1555	EXPORT_SYMBOL(create_empty_buffers);
				1556
				1557	/*
				1558	* We are taking a block for data and we don't want any output from any
				1559	* buffer-cache aliases starting from return from that function and
				1560	* until the moment when something will explicitly mark the buffer
				1561	* dirty (hopefully that will not happen until we will free that block ;-)
				1562	* We don't even need to mark it not-uptodate - nobody can expect
				1563	* anything from a newly allocated buffer anyway. We used to use
				1564	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1565	* don't want to mark the alias unmapped, for example - it would confuse
				1566	* anyone who might pick it with bread() afterwards...
				1567	*
				1568	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1569	* be writeout I/O going on against recently-freed buffers. We don't
				1570	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1571	* only if we really need to. That happens here.
				1572	*/
				1573	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1574	{
				1575	struct buffer_head *old_bh;
				1576
				1577	might_sleep();
				1578
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1579	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1580	if (old_bh) {
				1581	clear_buffer_dirty(old_bh);
				1582	wait_on_buffer(old_bh);
				1583	clear_buffer_req(old_bh);
				1584	__brelse(old_bh);
				1585	}
				1586	}
				1587	EXPORT_SYMBOL(unmap_underlying_metadata);
				1588
/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */
				1601
				1602	/*
				1603	* While block_write_full_page is writing back the dirty buffers under
				1604	* the page lock, whoever dirtied the buffers may decide to clean them
				1605	* again at any time. We handle that by only looking at the buffer
				1606	* state inside lock_buffer().
				1607	*
				1608	* If block_write_full_page() is called for regular writeback
				1609	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1610	* locked buffer. This only can happen if someone has written the buffer
				1611	* directly, with submit_bh(). At the address_space level PageWriteback
				1612	* prevents this contention from occurring.
				1613	*/
				1614	static int __block_write_full_page(struct inode inode, struct page page,
				1615	get_block_t get_block, struct writeback_control wbc)
				1616	{
				1617	int err;
				1618	sector_t block;
				1619	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1620	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1621	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1622	int nr_underway = 0;
				1623
				1624	BUG_ON(!PageLocked(page));
				1625
				1626	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1627
				1628	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1629	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1630	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1631	}
				1632
				1633	/*
				1634	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1635	* here, and the (potentially unmapped) buffers may become dirty at
				1636	* any time. If a buffer becomes dirty here after we've inspected it
				1637	* then we just miss that fact, and the page stays dirty.
				1638	*
				1639	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1640	* handle that here by just cleaning them.
				1641	*/
				1642
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1643	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1644	head = page_buffers(page);
				1645	bh = head;
				1646
				1647	/*
				1648	* Get all the dirty buffers mapped to disk addresses and
				1649	* handle any aliases from the underlying blockdev's mapping.
				1650	*/
				1651	do {
				1652	if (block > last_block) {
				1653	/*
				1654	* mapped buffers outside i_size will occur, because
				1655	* this page can be outside i_size when there is a
				1656	* truncate in progress.
				1657	*/
				1658	/*
				1659	* The buffer was zeroed by block_write_full_page()
				1660	*/
				1661	clear_buffer_dirty(bh);
				1662	set_buffer_uptodate(bh);
				1663	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1664	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1665	err = get_block(inode, block, bh, 1);
				1666	if (err)
				1667	goto recover;
				1668	if (buffer_new(bh)) {
				1669	/* blockdev mappings never come here */
				1670	clear_buffer_new(bh);
				1671	unmap_underlying_metadata(bh->b_bdev,
				1672	bh->b_blocknr);
				1673	}
				1674	}
				1675	bh = bh->b_this_page;
				1676	block++;
				1677	} while (bh != head);
				1678
				1679	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1680	if (!buffer_mapped(bh))
				1681	continue;
				1682	/*
				1683	* If it's a fully non-blocking write attempt and we cannot
				1684	* lock the buffer then redirty the page. Note that this can
				1685	* potentially cause a busy-wait loop from pdflush and kswapd
				1686	* activity, but those code paths have their own higher-level
				1687	* throttling.
				1688	*/
				1689	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1690	lock_buffer(bh);
				1691	} else if (test_set_buffer_locked(bh)) {
				1692	redirty_page_for_writepage(wbc, page);
				1693	continue;
				1694	}
				1695	if (test_clear_buffer_dirty(bh)) {
				1696	mark_buffer_async_write(bh);
				1697	} else {
				1698	unlock_buffer(bh);
				1699	}
				1700	} while ((bh = bh->b_this_page) != head);
				1701
				1702	/*
				1703	* The page and its buffers are protected by PageWriteback(), so we can
				1704	* drop the bh refcounts early.
				1705	*/
				1706	BUG_ON(PageWriteback(page));
				1707	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1708
				1709	do {
				1710	struct buffer_head *next = bh->b_this_page;
				1711	if (buffer_async_write(bh)) {
				1712	submit_bh(WRITE, bh);
				1713	nr_underway++;
				1714	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1715	bh = next;
				1716	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1717	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1718
				1719	err = 0;
				1720	done:
				1721	if (nr_underway == 0) {
				1722	/*
				1723	* The page was marked dirty, but the buffers were
				1724	* clean. Someone wrote them back by hand with
				1725	* ll_rw_block/submit_bh. A rare case.
				1726	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1727	end_page_writeback(page);
Nick Piggin	3d67f2d	2007-05-06 14:49:05 -0700	[diff] [blame]	1728
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1729	/*
				1730	* The page and buffer_heads can be released at any time from
				1731	* here on.
				1732	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1733	}
				1734	return err;
				1735
				1736	recover:
				1737	/*
				1738	* ENOSPC, or some other error. We may already have added some
				1739	* blocks to the file, so we need to write these out to avoid
				1740	* exposing stale data.
				1741	* The page is currently locked and not marked for writeback
				1742	*/
				1743	bh = head;
				1744	/* Recovery: lock and submit the mapped buffers */
				1745	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1746	if (buffer_mapped(bh) && buffer_dirty(bh)) {
				1747	lock_buffer(bh);
				1748	mark_buffer_async_write(bh);
				1749	} else {
				1750	/*
				1751	* The buffer may have been set dirty during
				1752	* attachment to a dirty page.
				1753	*/
				1754	clear_buffer_dirty(bh);
				1755	}
				1756	} while ((bh = bh->b_this_page) != head);
				1757	SetPageError(page);
				1758	BUG_ON(PageWriteback(page));
Andrew Morton	7e4c369	2007-05-08 00:23:27 -0700	[diff] [blame]	1759	mapping_set_error(page->mapping, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1760	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1761	do {
				1762	struct buffer_head *next = bh->b_this_page;
				1763	if (buffer_async_write(bh)) {
				1764	clear_buffer_dirty(bh);
				1765	submit_bh(WRITE, bh);
				1766	nr_underway++;
				1767	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1768	bh = next;
				1769	} while (bh != head);
Nick Piggin	ffda9d3	2007-02-20 13:57:54 -0800	[diff] [blame]	1770	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1771	goto done;
				1772	}
				1773
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1774	/*
				1775	* If a page has any new buffers, zero them out here, and mark them uptodate
				1776	* and dirty so they'll be written out (in order to prevent uninitialised
				1777	* block data from leaking). And clear the new bit.
				1778	*/
				1779	void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
				1780	{
				1781	unsigned int block_start, block_end;
				1782	struct buffer_head head, bh;
				1783
				1784	BUG_ON(!PageLocked(page));
				1785	if (!page_has_buffers(page))
				1786	return;
				1787
				1788	bh = head = page_buffers(page);
				1789	block_start = 0;
				1790	do {
				1791	block_end = block_start + bh->b_size;
				1792
				1793	if (buffer_new(bh)) {
				1794	if (block_end > from && block_start < to) {
				1795	if (!PageUptodate(page)) {
				1796	unsigned start, size;
				1797
				1798	start = max(from, block_start);
				1799	size = min(to, block_end) - start;
				1800
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1801	zero_user(page, start, size);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1802	set_buffer_uptodate(bh);
				1803	}
				1804
				1805	clear_buffer_new(bh);
				1806	mark_buffer_dirty(bh);
				1807	}
				1808	}
				1809
				1810	block_start = block_end;
				1811	bh = bh->b_this_page;
				1812	} while (bh != head);
				1813	}
				1814	EXPORT_SYMBOL(page_zero_new_buffers);
				1815
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1816	static int __block_prepare_write(struct inode inode, struct page page,
				1817	unsigned from, unsigned to, get_block_t *get_block)
				1818	{
				1819	unsigned block_start, block_end;
				1820	sector_t block;
				1821	int err = 0;
				1822	unsigned blocksize, bbits;
				1823	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1824
				1825	BUG_ON(!PageLocked(page));
				1826	BUG_ON(from > PAGE_CACHE_SIZE);
				1827	BUG_ON(to > PAGE_CACHE_SIZE);
				1828	BUG_ON(from > to);
				1829
				1830	blocksize = 1 << inode->i_blkbits;
				1831	if (!page_has_buffers(page))
				1832	create_empty_buffers(page, blocksize, 0);
				1833	head = page_buffers(page);
				1834
				1835	bbits = inode->i_blkbits;
				1836	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1837
				1838	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1839	block++, block_start=block_end, bh = bh->b_this_page) {
				1840	block_end = block_start + blocksize;
				1841	if (block_end <= from \|\| block_start >= to) {
				1842	if (PageUptodate(page)) {
				1843	if (!buffer_uptodate(bh))
				1844	set_buffer_uptodate(bh);
				1845	}
				1846	continue;
				1847	}
				1848	if (buffer_new(bh))
				1849	clear_buffer_new(bh);
				1850	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1851	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1852	err = get_block(inode, block, bh, 1);
				1853	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1854	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1855	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1856	unmap_underlying_metadata(bh->b_bdev,
				1857	bh->b_blocknr);
				1858	if (PageUptodate(page)) {
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1859	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1860	set_buffer_uptodate(bh);
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1861	mark_buffer_dirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1862	continue;
				1863	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1864	if (block_end > to \|\| block_start < from)
				1865	zero_user_segments(page,
				1866	to, block_end,
				1867	block_start, from);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1868	continue;
				1869	}
				1870	}
				1871	if (PageUptodate(page)) {
				1872	if (!buffer_uptodate(bh))
				1873	set_buffer_uptodate(bh);
				1874	continue;
				1875	}
				1876	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1877	!buffer_unwritten(bh) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1878	(block_start < from \|\| block_end > to)) {
				1879	ll_rw_block(READ, 1, &bh);
				1880	*wait_bh++=bh;
				1881	}
				1882	}
				1883	/*
				1884	* If we issued read requests - let them complete.
				1885	*/
				1886	while(wait_bh > wait) {
				1887	wait_on_buffer(*--wait_bh);
				1888	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1889	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1890	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1891	if (unlikely(err))
				1892	page_zero_new_buffers(page, from, to);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1893	return err;
				1894	}
				1895
				1896	static int __block_commit_write(struct inode inode, struct page page,
				1897	unsigned from, unsigned to)
				1898	{
				1899	unsigned block_start, block_end;
				1900	int partial = 0;
				1901	unsigned blocksize;
				1902	struct buffer_head bh, head;
				1903
				1904	blocksize = 1 << inode->i_blkbits;
				1905
				1906	for(bh = head = page_buffers(page), block_start = 0;
				1907	bh != head \|\| !block_start;
				1908	block_start=block_end, bh = bh->b_this_page) {
				1909	block_end = block_start + blocksize;
				1910	if (block_end <= from \|\| block_start >= to) {
				1911	if (!buffer_uptodate(bh))
				1912	partial = 1;
				1913	} else {
				1914	set_buffer_uptodate(bh);
				1915	mark_buffer_dirty(bh);
				1916	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1917	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1918	}
				1919
				1920	/*
				1921	* If this is a partial write which happened to make all buffers
				1922	* uptodate then we can optimize away a bogus readpage() for
				1923	* the next read(). Here we 'discover' whether the page went
				1924	* uptodate as a result of this (potentially partial) write.
				1925	*/
				1926	if (!partial)
				1927	SetPageUptodate(page);
				1928	return 0;
				1929	}
				1930
				1931	/*
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1932	* block_write_begin takes care of the basic task of block allocation and
				1933	* bringing partial write blocks uptodate first.
				1934	*
				1935	* If *pagep is not NULL, then block_write_begin uses the locked page
				1936	* at *pagep rather than allocating its own. In this case, the page will
				1937	* not be unlocked or deallocated on failure.
				1938	*/
				1939	int block_write_begin(struct file file, struct address_space mapping,
				1940	loff_t pos, unsigned len, unsigned flags,
				1941	struct page pagep, void fsdata,
				1942	get_block_t *get_block)
				1943	{
				1944	struct inode *inode = mapping->host;
				1945	int status = 0;
				1946	struct page *page;
				1947	pgoff_t index;
				1948	unsigned start, end;
				1949	int ownpage = 0;
				1950
				1951	index = pos >> PAGE_CACHE_SHIFT;
				1952	start = pos & (PAGE_CACHE_SIZE - 1);
				1953	end = start + len;
				1954
				1955	page = *pagep;
				1956	if (page == NULL) {
				1957	ownpage = 1;
				1958	page = __grab_cache_page(mapping, index);
				1959	if (!page) {
				1960	status = -ENOMEM;
				1961	goto out;
				1962	}
				1963	*pagep = page;
				1964	} else
				1965	BUG_ON(!PageLocked(page));
				1966
				1967	status = __block_prepare_write(inode, page, start, end, get_block);
				1968	if (unlikely(status)) {
				1969	ClearPageUptodate(page);
				1970
				1971	if (ownpage) {
				1972	unlock_page(page);
				1973	page_cache_release(page);
				1974	*pagep = NULL;
				1975
				1976	/*
				1977	* prepare_write() may have instantiated a few blocks
				1978	* outside i_size. Trim these off again. Don't need
				1979	* i_size_read because we hold i_mutex.
				1980	*/
				1981	if (pos + len > inode->i_size)
				1982	vmtruncate(inode, inode->i_size);
				1983	}
				1984	goto out;
				1985	}
				1986
				1987	out:
				1988	return status;
				1989	}
				1990	EXPORT_SYMBOL(block_write_begin);
				1991
				1992	int block_write_end(struct file file, struct address_space mapping,
				1993	loff_t pos, unsigned len, unsigned copied,
				1994	struct page page, void fsdata)
				1995	{
				1996	struct inode *inode = mapping->host;
				1997	unsigned start;
				1998
				1999	start = pos & (PAGE_CACHE_SIZE - 1);
				2000
				2001	if (unlikely(copied < len)) {
				2002	/*
				2003	* The buffers that were written will now be uptodate, so we
				2004	* don't have to worry about a readpage reading them and
				2005	* overwriting a partial write. However if we have encountered
				2006	* a short write and only partially written into a buffer, it
				2007	* will not be marked uptodate, so a readpage might come in and
				2008	* destroy our partial write.
				2009	*
				2010	* Do the simplest thing, and just treat any short write to a
				2011	* non uptodate page as a zero-length write, and force the
				2012	* caller to redo the whole thing.
				2013	*/
				2014	if (!PageUptodate(page))
				2015	copied = 0;
				2016
				2017	page_zero_new_buffers(page, start+copied, start+len);
				2018	}
				2019	flush_dcache_page(page);
				2020
				2021	/* This could be a short (even 0-length) commit */
				2022	__block_commit_write(inode, page, start, start+copied);
				2023
				2024	return copied;
				2025	}
				2026	EXPORT_SYMBOL(block_write_end);
				2027
				2028	int generic_write_end(struct file file, struct address_space mapping,
				2029	loff_t pos, unsigned len, unsigned copied,
				2030	struct page page, void fsdata)
				2031	{
				2032	struct inode *inode = mapping->host;
				2033
				2034	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
				2035
				2036	/*
				2037	* No need to use i_size_read() here, the i_size
				2038	* cannot change under us because we hold i_mutex.
				2039	*
				2040	* But it's important to update i_size while still holding page lock:
				2041	* page writeout could otherwise come in and zero beyond i_size.
				2042	*/
				2043	if (pos+copied > inode->i_size) {
				2044	i_size_write(inode, pos+copied);
				2045	mark_inode_dirty(inode);
				2046	}
				2047
				2048	unlock_page(page);
				2049	page_cache_release(page);
				2050
				2051	return copied;
				2052	}
				2053	EXPORT_SYMBOL(generic_write_end);
				2054
				2055	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2056	* Generic "read page" function for block devices that have the normal
				2057	* get_block functionality. This is most of the block device filesystems.
				2058	* Reads the page asynchronously --- the unlock_buffer() and
				2059	* set/clear_buffer_uptodate() functions propagate buffer state into the
				2060	* page struct once IO has completed.
				2061	*/
				2062	int block_read_full_page(struct page page, get_block_t get_block)
				2063	{
				2064	struct inode *inode = page->mapping->host;
				2065	sector_t iblock, lblock;
				2066	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				2067	unsigned int blocksize;
				2068	int nr, i;
				2069	int fully_mapped = 1;
				2070
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	2071	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2072	blocksize = 1 << inode->i_blkbits;
				2073	if (!page_has_buffers(page))
				2074	create_empty_buffers(page, blocksize, 0);
				2075	head = page_buffers(page);
				2076
				2077	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2078	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				2079	bh = head;
				2080	nr = 0;
				2081	i = 0;
				2082
				2083	do {
				2084	if (buffer_uptodate(bh))
				2085	continue;
				2086
				2087	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2088	int err = 0;
				2089
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2090	fully_mapped = 0;
				2091	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2092	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2093	err = get_block(inode, iblock, bh, 0);
				2094	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2095	SetPageError(page);
				2096	}
				2097	if (!buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2098	zero_user(page, i * blocksize, blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2099	if (!err)
				2100	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2101	continue;
				2102	}
				2103	/*
				2104	* get_block() might have updated the buffer
				2105	* synchronously
				2106	*/
				2107	if (buffer_uptodate(bh))
				2108	continue;
				2109	}
				2110	arr[nr++] = bh;
				2111	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2112
				2113	if (fully_mapped)
				2114	SetPageMappedToDisk(page);
				2115
				2116	if (!nr) {
				2117	/*
				2118	* All buffers are uptodate - we can set the page uptodate
				2119	* as well. But not if get_block() returned an error.
				2120	*/
				2121	if (!PageError(page))
				2122	SetPageUptodate(page);
				2123	unlock_page(page);
				2124	return 0;
				2125	}
				2126
				2127	/* Stage two: lock the buffers */
				2128	for (i = 0; i < nr; i++) {
				2129	bh = arr[i];
				2130	lock_buffer(bh);
				2131	mark_buffer_async_read(bh);
				2132	}
				2133
				2134	/*
				2135	* Stage 3: start the IO. Check for uptodateness
				2136	* inside the buffer lock in case another process reading
				2137	* the underlying blockdev brought it uptodate (the sct fix).
				2138	*/
				2139	for (i = 0; i < nr; i++) {
				2140	bh = arr[i];
				2141	if (buffer_uptodate(bh))
				2142	end_buffer_async_read(bh, 1);
				2143	else
				2144	submit_bh(READ, bh);
				2145	}
				2146	return 0;
				2147	}
				2148
				2149	/* utility function for filesystems that need to do work on expanding
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2150	* truncates. Uses filesystem pagecache writes to allow the filesystem to
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2151	* deal with the hole.
				2152	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2153	int generic_cont_expand_simple(struct inode *inode, loff_t size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2154	{
				2155	struct address_space *mapping = inode->i_mapping;
				2156	struct page *page;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2157	void *fsdata;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2158	unsigned long limit;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2159	int err;
				2160
				2161	err = -EFBIG;
				2162	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2163	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2164	send_sig(SIGXFSZ, current, 0);
				2165	goto out;
				2166	}
				2167	if (size > inode->i_sb->s_maxbytes)
				2168	goto out;
				2169
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2170	err = pagecache_write_begin(NULL, mapping, size, 0,
				2171	AOP_FLAG_UNINTERRUPTIBLE\|AOP_FLAG_CONT_EXPAND,
				2172	&page, &fsdata);
				2173	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2174	goto out;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2175
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2176	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
				2177	BUG_ON(err > 0);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2178
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2179	out:
				2180	return err;
				2181	}
				2182
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2183	int cont_expand_zero(struct file file, struct address_space mapping,
				2184	loff_t pos, loff_t *bytes)
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2185	{
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2186	struct inode *inode = mapping->host;
				2187	unsigned blocksize = 1 << inode->i_blkbits;
				2188	struct page *page;
				2189	void *fsdata;
				2190	pgoff_t index, curidx;
				2191	loff_t curpos;
				2192	unsigned zerofrom, offset, len;
				2193	int err = 0;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2194
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2195	index = pos >> PAGE_CACHE_SHIFT;
				2196	offset = pos & ~PAGE_CACHE_MASK;
				2197
				2198	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
				2199	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2200	if (zerofrom & (blocksize-1)) {
				2201	*bytes \|= (blocksize-1);
				2202	(*bytes)++;
				2203	}
				2204	len = PAGE_CACHE_SIZE - zerofrom;
				2205
				2206	err = pagecache_write_begin(file, mapping, curpos, len,
				2207	AOP_FLAG_UNINTERRUPTIBLE,
				2208	&page, &fsdata);
				2209	if (err)
				2210	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2211	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2212	err = pagecache_write_end(file, mapping, curpos, len, len,
				2213	page, fsdata);
				2214	if (err < 0)
				2215	goto out;
				2216	BUG_ON(err != len);
				2217	err = 0;
				2218	}
				2219
				2220	/* page covers the boundary, find the boundary offset */
				2221	if (index == curidx) {
				2222	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2223	/* if we will expand the thing last block will be filled */
				2224	if (offset <= zerofrom) {
				2225	goto out;
				2226	}
				2227	if (zerofrom & (blocksize-1)) {
				2228	*bytes \|= (blocksize-1);
				2229	(*bytes)++;
				2230	}
				2231	len = offset - zerofrom;
				2232
				2233	err = pagecache_write_begin(file, mapping, curpos, len,
				2234	AOP_FLAG_UNINTERRUPTIBLE,
				2235	&page, &fsdata);
				2236	if (err)
				2237	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2238	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2239	err = pagecache_write_end(file, mapping, curpos, len, len,
				2240	page, fsdata);
				2241	if (err < 0)
				2242	goto out;
				2243	BUG_ON(err != len);
				2244	err = 0;
				2245	}
				2246	out:
				2247	return err;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2248	}
				2249
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2250	/*
				2251	* For moronic filesystems that do not allow holes in file.
				2252	* We may have to extend the file.
				2253	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2254	int cont_write_begin(struct file file, struct address_space mapping,
				2255	loff_t pos, unsigned len, unsigned flags,
				2256	struct page pagep, void fsdata,
				2257	get_block_t get_block, loff_t bytes)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2258	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2259	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2260	unsigned blocksize = 1 << inode->i_blkbits;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2261	unsigned zerofrom;
				2262	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2263
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2264	err = cont_expand_zero(file, mapping, pos, bytes);
				2265	if (err)
				2266	goto out;
				2267
				2268	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2269	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
				2270	*bytes \|= (blocksize-1);
				2271	(*bytes)++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2272	}
				2273
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2274	*pagep = NULL;
				2275	err = block_write_begin(file, mapping, pos, len,
				2276	flags, pagep, fsdata, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2277	out:
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2278	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2279	}
				2280
				2281	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2282	get_block_t *get_block)
				2283	{
				2284	struct inode *inode = page->mapping->host;
				2285	int err = __block_prepare_write(inode, page, from, to, get_block);
				2286	if (err)
				2287	ClearPageUptodate(page);
				2288	return err;
				2289	}
				2290
				2291	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2292	{
				2293	struct inode *inode = page->mapping->host;
				2294	__block_commit_write(inode,page,from,to);
				2295	return 0;
				2296	}
				2297
				2298	int generic_commit_write(struct file file, struct page page,
				2299	unsigned from, unsigned to)
				2300	{
				2301	struct inode *inode = page->mapping->host;
				2302	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2303	__block_commit_write(inode,page,from,to);
				2304	/*
				2305	* No need to use i_size_read() here, the i_size
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	2306	* cannot change under us because we hold i_mutex.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2307	*/
				2308	if (pos > inode->i_size) {
				2309	i_size_write(inode, pos);
				2310	mark_inode_dirty(inode);
				2311	}
				2312	return 0;
				2313	}
				2314
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2315	/*
				2316	* block_page_mkwrite() is not allowed to change the file size as it gets
				2317	* called from a page fault handler when a page is first dirtied. Hence we must
				2318	* be careful to check for EOF conditions here. We set the page up correctly
				2319	* for a written page which means we get ENOSPC checking when writing into
				2320	* holes and correct delalloc and unwritten extent mapping on filesystems that
				2321	* support these features.
				2322	*
				2323	* We are not allowed to take the i_mutex here so we have to play games to
				2324	* protect against truncate races as the page could now be beyond EOF. Because
				2325	* vmtruncate() writes the inode size before removing pages, once we have the
				2326	* page lock we can determine safely if the page is beyond EOF. If it is not
				2327	* beyond EOF, then the page is guaranteed safe against truncation until we
				2328	* unlock the page.
				2329	*/
				2330	int
				2331	block_page_mkwrite(struct vm_area_struct vma, struct page page,
				2332	get_block_t get_block)
				2333	{
				2334	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
				2335	unsigned long end;
				2336	loff_t size;
				2337	int ret = -EINVAL;
				2338
				2339	lock_page(page);
				2340	size = i_size_read(inode);
				2341	if ((page->mapping != inode->i_mapping) \|\|
Nick Piggin	1833633	2007-07-20 00:31:45 -0700	[diff] [blame]	2342	(page_offset(page) > size)) {
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2343	/* page got truncated out from underneath us */
				2344	goto out_unlock;
				2345	}
				2346
				2347	/* page is wholly or partially inside EOF */
				2348	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
				2349	end = size & ~PAGE_CACHE_MASK;
				2350	else
				2351	end = PAGE_CACHE_SIZE;
				2352
				2353	ret = block_prepare_write(page, 0, end, get_block);
				2354	if (!ret)
				2355	ret = block_commit_write(page, 0, end);
				2356
				2357	out_unlock:
				2358	unlock_page(page);
				2359	return ret;
				2360	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2361
				2362	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2363	* nobh_write_begin()'s prereads are special: the buffer_heads are freed
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2364	* immediately, while under the page lock. So it needs a special end_io
				2365	* handler which does not touch the bh after unlocking it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2366	*/
				2367	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2368	{
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	2369	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2370	}
				2371
				2372	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2373	* Attach the singly-linked list of buffers created by nobh_write_begin, to
				2374	* the page (converting it to circular linked list and taking care of page
				2375	* dirty races).
				2376	*/
				2377	static void attach_nobh_buffers(struct page page, struct buffer_head head)
				2378	{
				2379	struct buffer_head *bh;
				2380
				2381	BUG_ON(!PageLocked(page));
				2382
				2383	spin_lock(&page->mapping->private_lock);
				2384	bh = head;
				2385	do {
				2386	if (PageDirty(page))
				2387	set_buffer_dirty(bh);
				2388	if (!bh->b_this_page)
				2389	bh->b_this_page = head;
				2390	bh = bh->b_this_page;
				2391	} while (bh != head);
				2392	attach_page_buffers(page, head);
				2393	spin_unlock(&page->mapping->private_lock);
				2394	}
				2395
				2396	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2397	* On entry, the page is fully not uptodate.
				2398	* On exit the page is fully uptodate in the areas outside (from,to)
				2399	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2400	int nobh_write_begin(struct file file, struct address_space mapping,
				2401	loff_t pos, unsigned len, unsigned flags,
				2402	struct page pagep, void fsdata,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2403	get_block_t *get_block)
				2404	{
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2405	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2406	const unsigned blkbits = inode->i_blkbits;
				2407	const unsigned blocksize = 1 << blkbits;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2408	struct buffer_head head, bh;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2409	struct page *page;
				2410	pgoff_t index;
				2411	unsigned from, to;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2412	unsigned block_in_page;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2413	unsigned block_start, block_end;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2414	sector_t block_in_file;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2415	int nr_reads = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2416	int ret = 0;
				2417	int is_mapped_to_disk = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2418
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2419	index = pos >> PAGE_CACHE_SHIFT;
				2420	from = pos & (PAGE_CACHE_SIZE - 1);
				2421	to = from + len;
				2422
				2423	page = __grab_cache_page(mapping, index);
				2424	if (!page)
				2425	return -ENOMEM;
				2426	*pagep = page;
				2427	*fsdata = NULL;
				2428
				2429	if (page_has_buffers(page)) {
				2430	unlock_page(page);
				2431	page_cache_release(page);
				2432	*pagep = NULL;
				2433	return block_write_begin(file, mapping, pos, len, flags, pagep,
				2434	fsdata, get_block);
				2435	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2436
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2437	if (PageMappedToDisk(page))
				2438	return 0;
				2439
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2440	/*
				2441	* Allocate buffers so that we can keep track of state, and potentially
				2442	* attach them to the page if an error occurs. In the common case of
				2443	* no error, they will just be freed again without ever being attached
				2444	* to the page (which is all OK, because we're under the page lock).
				2445	*
				2446	* Be careful: the buffer linked list is a NULL terminated one, rather
				2447	* than the circular one we're used to.
				2448	*/
				2449	head = alloc_page_buffers(page, blocksize, 0);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2450	if (!head) {
				2451	ret = -ENOMEM;
				2452	goto out_release;
				2453	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2454
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2455	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2456
				2457	/*
				2458	* We loop across all blocks in the page, whether or not they are
				2459	* part of the affected region. This is so we can discover if the
				2460	* page is fully mapped-to-disk.
				2461	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2462	for (block_start = 0, block_in_page = 0, bh = head;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2463	block_start < PAGE_CACHE_SIZE;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2464	block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2465	int create;
				2466
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2467	block_end = block_start + blocksize;
				2468	bh->b_state = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2469	create = 1;
				2470	if (block_start >= to)
				2471	create = 0;
				2472	ret = get_block(inode, block_in_file + block_in_page,
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2473	bh, create);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2474	if (ret)
				2475	goto failed;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2476	if (!buffer_mapped(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2477	is_mapped_to_disk = 0;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2478	if (buffer_new(bh))
				2479	unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
				2480	if (PageUptodate(page)) {
				2481	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2482	continue;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2483	}
				2484	if (buffer_new(bh) \|\| !buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2485	zero_user_segments(page, block_start, from,
				2486	to, block_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2487	continue;
				2488	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2489	if (buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2490	continue; /* reiserfs does this */
				2491	if (block_start < from \|\| block_end > to) {
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2492	lock_buffer(bh);
				2493	bh->b_end_io = end_buffer_read_nobh;
				2494	submit_bh(READ, bh);
				2495	nr_reads++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2496	}
				2497	}
				2498
				2499	if (nr_reads) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2500	/*
				2501	* The page is locked, so these buffers are protected from
				2502	* any VM or truncate activity. Hence we don't need to care
				2503	* for the buffer_head refcounts.
				2504	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2505	for (bh = head; bh; bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2506	wait_on_buffer(bh);
				2507	if (!buffer_uptodate(bh))
				2508	ret = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2509	}
				2510	if (ret)
				2511	goto failed;
				2512	}
				2513
				2514	if (is_mapped_to_disk)
				2515	SetPageMappedToDisk(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2516
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2517	fsdata = head; / to be released by nobh_write_end */
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2518
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2519	return 0;
				2520
				2521	failed:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2522	BUG_ON(!ret);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2523	/*
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2524	* Error recovery is a bit difficult. We need to zero out blocks that
				2525	* were newly allocated, and dirty them to ensure they get written out.
				2526	* Buffers need to be attached to the page at this point, otherwise
				2527	* the handling of potential IO errors during writeout would be hard
				2528	* (could try doing synchronous writeout, but what if that fails too?)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2529	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2530	attach_nobh_buffers(page, head);
				2531	page_zero_new_buffers(page, from, to);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2532
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2533	out_release:
				2534	unlock_page(page);
				2535	page_cache_release(page);
				2536	*pagep = NULL;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2537
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2538	if (pos + len > inode->i_size)
				2539	vmtruncate(inode, inode->i_size);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2540
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2541	return ret;
				2542	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2543	EXPORT_SYMBOL(nobh_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2544
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2545	int nobh_write_end(struct file file, struct address_space mapping,
				2546	loff_t pos, unsigned len, unsigned copied,
				2547	struct page page, void fsdata)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2548	{
				2549	struct inode *inode = page->mapping->host;
Nick Piggin	efdc313	2007-10-21 06:57:41 +0200	[diff] [blame]	2550	struct buffer_head *head = fsdata;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2551	struct buffer_head *bh;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2552
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2553	if (!PageMappedToDisk(page)) {
				2554	if (unlikely(copied < len) && !page_has_buffers(page))
				2555	attach_nobh_buffers(page, head);
				2556	if (page_has_buffers(page))
				2557	return generic_write_end(file, mapping, pos, len,
				2558	copied, page, fsdata);
				2559	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2560
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2561	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2562	set_page_dirty(page);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2563	if (pos+copied > inode->i_size) {
				2564	i_size_write(inode, pos+copied);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2565	mark_inode_dirty(inode);
				2566	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2567
				2568	unlock_page(page);
				2569	page_cache_release(page);
				2570
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2571	while (head) {
				2572	bh = head;
				2573	head = head->b_this_page;
				2574	free_buffer_head(bh);
				2575	}
				2576
				2577	return copied;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2578	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2579	EXPORT_SYMBOL(nobh_write_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2580
				2581	/*
				2582	* nobh_writepage() - based on block_full_write_page() except
				2583	* that it tries to operate without attaching bufferheads to
				2584	* the page.
				2585	*/
				2586	int nobh_writepage(struct page page, get_block_t get_block,
				2587	struct writeback_control *wbc)
				2588	{
				2589	struct inode * const inode = page->mapping->host;
				2590	loff_t i_size = i_size_read(inode);
				2591	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2592	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2593	int ret;
				2594
				2595	/* Is the page fully inside i_size? */
				2596	if (page->index < end_index)
				2597	goto out;
				2598
				2599	/* Is the page fully outside i_size? (truncate in progress) */
				2600	offset = i_size & (PAGE_CACHE_SIZE-1);
				2601	if (page->index >= end_index+1 \|\| !offset) {
				2602	/*
				2603	* The page may have dirty, unmapped buffers. For example,
				2604	* they may have been added in ext3_writepage(). Make them
				2605	* freeable here, so the page does not leak.
				2606	*/
				2607	#if 0
				2608	/* Not really sure about this - do we need this ? */
				2609	if (page->mapping->a_ops->invalidatepage)
				2610	page->mapping->a_ops->invalidatepage(page, offset);
				2611	#endif
				2612	unlock_page(page);
				2613	return 0; /* don't care */
				2614	}
				2615
				2616	/*
				2617	* The page straddles i_size. It must be zeroed out on each and every
				2618	* writepage invocation because it may be mmapped. "A file is mapped
				2619	* in multiples of the page size. For a file that is not a multiple of
				2620	* the page size, the remaining memory is zeroed when mapped, and
				2621	* writes to that region are not written out to the file."
				2622	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2623	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2624	out:
				2625	ret = mpage_writepage(page, get_block, wbc);
				2626	if (ret == -EAGAIN)
				2627	ret = __block_write_full_page(inode, page, get_block, wbc);
				2628	return ret;
				2629	}
				2630	EXPORT_SYMBOL(nobh_writepage);
				2631
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2632	int nobh_truncate_page(struct address_space *mapping,
				2633	loff_t from, get_block_t *get_block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2634	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2635	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2636	unsigned offset = from & (PAGE_CACHE_SIZE-1);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2637	unsigned blocksize;
				2638	sector_t iblock;
				2639	unsigned length, pos;
				2640	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2641	struct page *page;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2642	struct buffer_head map_bh;
				2643	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2644
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2645	blocksize = 1 << inode->i_blkbits;
				2646	length = offset & (blocksize - 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2647
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2648	/* Block boundary? Nothing to do */
				2649	if (!length)
				2650	return 0;
				2651
				2652	length = blocksize - length;
				2653	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2654
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2655	page = grab_cache_page(mapping, index);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2656	err = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2657	if (!page)
				2658	goto out;
				2659
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2660	if (page_has_buffers(page)) {
				2661	has_buffers:
				2662	unlock_page(page);
				2663	page_cache_release(page);
				2664	return block_truncate_page(mapping, from, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2665	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2666
				2667	/* Find the buffer that contains "offset" */
				2668	pos = blocksize;
				2669	while (offset >= pos) {
				2670	iblock++;
				2671	pos += blocksize;
				2672	}
				2673
				2674	err = get_block(inode, iblock, &map_bh, 0);
				2675	if (err)
				2676	goto unlock;
				2677	/* unmapped? It's a hole - nothing to do */
				2678	if (!buffer_mapped(&map_bh))
				2679	goto unlock;
				2680
				2681	/* Ok, it's mapped. Make sure it's up-to-date */
				2682	if (!PageUptodate(page)) {
				2683	err = mapping->a_ops->readpage(NULL, page);
				2684	if (err) {
				2685	page_cache_release(page);
				2686	goto out;
				2687	}
				2688	lock_page(page);
				2689	if (!PageUptodate(page)) {
				2690	err = -EIO;
				2691	goto unlock;
				2692	}
				2693	if (page_has_buffers(page))
				2694	goto has_buffers;
				2695	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2696	zero_user(page, offset, length);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2697	set_page_dirty(page);
				2698	err = 0;
				2699
				2700	unlock:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2701	unlock_page(page);
				2702	page_cache_release(page);
				2703	out:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2704	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2705	}
				2706	EXPORT_SYMBOL(nobh_truncate_page);
				2707
				2708	int block_truncate_page(struct address_space *mapping,
				2709	loff_t from, get_block_t *get_block)
				2710	{
				2711	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2712	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2713	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2714	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2715	unsigned length, pos;
				2716	struct inode *inode = mapping->host;
				2717	struct page *page;
				2718	struct buffer_head *bh;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2719	int err;
				2720
				2721	blocksize = 1 << inode->i_blkbits;
				2722	length = offset & (blocksize - 1);
				2723
				2724	/* Block boundary? Nothing to do */
				2725	if (!length)
				2726	return 0;
				2727
				2728	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2729	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2730
				2731	page = grab_cache_page(mapping, index);
				2732	err = -ENOMEM;
				2733	if (!page)
				2734	goto out;
				2735
				2736	if (!page_has_buffers(page))
				2737	create_empty_buffers(page, blocksize, 0);
				2738
				2739	/* Find the buffer that contains "offset" */
				2740	bh = page_buffers(page);
				2741	pos = blocksize;
				2742	while (offset >= pos) {
				2743	bh = bh->b_this_page;
				2744	iblock++;
				2745	pos += blocksize;
				2746	}
				2747
				2748	err = 0;
				2749	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2750	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2751	err = get_block(inode, iblock, bh, 0);
				2752	if (err)
				2753	goto unlock;
				2754	/* unmapped? It's a hole - nothing to do */
				2755	if (!buffer_mapped(bh))
				2756	goto unlock;
				2757	}
				2758
				2759	/* Ok, it's mapped. Make sure it's up-to-date */
				2760	if (PageUptodate(page))
				2761	set_buffer_uptodate(bh);
				2762
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2763	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2764	err = -EIO;
				2765	ll_rw_block(READ, 1, &bh);
				2766	wait_on_buffer(bh);
				2767	/* Uhhuh. Read error. Complain and punt. */
				2768	if (!buffer_uptodate(bh))
				2769	goto unlock;
				2770	}
				2771
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2772	zero_user(page, offset, length);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2773	mark_buffer_dirty(bh);
				2774	err = 0;
				2775
				2776	unlock:
				2777	unlock_page(page);
				2778	page_cache_release(page);
				2779	out:
				2780	return err;
				2781	}
				2782
				2783	/*
				2784	* The generic ->writepage function for buffer-backed address_spaces
				2785	*/
				2786	int block_write_full_page(struct page page, get_block_t get_block,
				2787	struct writeback_control *wbc)
				2788	{
				2789	struct inode * const inode = page->mapping->host;
				2790	loff_t i_size = i_size_read(inode);
				2791	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2792	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2793
				2794	/* Is the page fully inside i_size? */
				2795	if (page->index < end_index)
				2796	return __block_write_full_page(inode, page, get_block, wbc);
				2797
				2798	/* Is the page fully outside i_size? (truncate in progress) */
				2799	offset = i_size & (PAGE_CACHE_SIZE-1);
				2800	if (page->index >= end_index+1 \|\| !offset) {
				2801	/*
				2802	* The page may have dirty, unmapped buffers. For example,
				2803	* they may have been added in ext3_writepage(). Make them
				2804	* freeable here, so the page does not leak.
				2805	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2806	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2807	unlock_page(page);
				2808	return 0; /* don't care */
				2809	}
				2810
				2811	/*
				2812	* The page straddles i_size. It must be zeroed out on each and every
				2813	* writepage invokation because it may be mmapped. "A file is mapped
				2814	* in multiples of the page size. For a file that is not a multiple of
				2815	* the page size, the remaining memory is zeroed when mapped, and
				2816	* writes to that region are not written out to the file."
				2817	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2818	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2819	return __block_write_full_page(inode, page, get_block, wbc);
				2820	}
				2821
				2822	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2823	get_block_t *get_block)
				2824	{
				2825	struct buffer_head tmp;
				2826	struct inode *inode = mapping->host;
				2827	tmp.b_state = 0;
				2828	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2829	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2830	get_block(inode, block, &tmp, 0);
				2831	return tmp.b_blocknr;
				2832	}
				2833
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	2834	static void end_bio_bh_io_sync(struct bio *bio, int err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2835	{
				2836	struct buffer_head *bh = bio->bi_private;
				2837
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2838	if (err == -EOPNOTSUPP) {
				2839	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2840	set_bit(BH_Eopnotsupp, &bh->b_state);
				2841	}
				2842
				2843	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2844	bio_put(bio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2845	}
				2846
				2847	int submit_bh(int rw, struct buffer_head * bh)
				2848	{
				2849	struct bio *bio;
				2850	int ret = 0;
				2851
				2852	BUG_ON(!buffer_locked(bh));
				2853	BUG_ON(!buffer_mapped(bh));
				2854	BUG_ON(!bh->b_end_io);
				2855
				2856	if (buffer_ordered(bh) && (rw == WRITE))
				2857	rw = WRITE_BARRIER;
				2858
				2859	/*
				2860	* Only clear out a write error when rewriting, should this
				2861	* include WRITE_SYNC as well?
				2862	*/
				2863	if (test_set_buffer_req(bh) && (rw == WRITE \|\| rw == WRITE_BARRIER))
				2864	clear_buffer_write_io_error(bh);
				2865
				2866	/*
				2867	* from here on down, it's all bio -- do the initial mapping,
				2868	* submit_bio -> generic_make_request may further map this bio around
				2869	*/
				2870	bio = bio_alloc(GFP_NOIO, 1);
				2871
				2872	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2873	bio->bi_bdev = bh->b_bdev;
				2874	bio->bi_io_vec[0].bv_page = bh->b_page;
				2875	bio->bi_io_vec[0].bv_len = bh->b_size;
				2876	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2877
				2878	bio->bi_vcnt = 1;
				2879	bio->bi_idx = 0;
				2880	bio->bi_size = bh->b_size;
				2881
				2882	bio->bi_end_io = end_bio_bh_io_sync;
				2883	bio->bi_private = bh;
				2884
				2885	bio_get(bio);
				2886	submit_bio(rw, bio);
				2887
				2888	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2889	ret = -EOPNOTSUPP;
				2890
				2891	bio_put(bio);
				2892	return ret;
				2893	}
				2894
				2895	/**
				2896	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2897	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2898	* @nr: number of &struct buffer_heads in the array
				2899	* @bhs: array of pointers to &struct buffer_head
				2900	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2901	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2902	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2903	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2904	* are sent to disk. The fourth %READA option is described in the documentation
				2905	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2906	*
				2907	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2908	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				2909	* clean when doing a write request, and any buffer that appears to be
				2910	* up-to-date when doing read request. Further it marks as clean buffers that
				2911	* are processed for writing (the buffer cache won't assume that they are
				2912	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2913	*
				2914	* ll_rw_block sets b_end_io to simple completion handler that marks
				2915	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				2916	* any waiters.
				2917	*
				2918	* All of the buffers must be for the same device, and must also be a
				2919	* multiple of the current approved size for the device.
				2920	*/
				2921	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				2922	{
				2923	int i;
				2924
				2925	for (i = 0; i < nr; i++) {
				2926	struct buffer_head *bh = bhs[i];
				2927
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2928	if (rw == SWRITE)
				2929	lock_buffer(bh);
				2930	else if (test_set_buffer_locked(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2931	continue;
				2932
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2933	if (rw == WRITE \|\| rw == SWRITE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2934	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2935	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2936	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2937	submit_bh(WRITE, bh);
				2938	continue;
				2939	}
				2940	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2941	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2942	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2943	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2944	submit_bh(rw, bh);
				2945	continue;
				2946	}
				2947	}
				2948	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2949	}
				2950	}
				2951
				2952	/*
				2953	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2954	* and then start new I/O and then wait upon it. The caller must have a ref on
				2955	* the buffer_head.
				2956	*/
				2957	int sync_dirty_buffer(struct buffer_head *bh)
				2958	{
				2959	int ret = 0;
				2960
				2961	WARN_ON(atomic_read(&bh->b_count) < 1);
				2962	lock_buffer(bh);
				2963	if (test_clear_buffer_dirty(bh)) {
				2964	get_bh(bh);
				2965	bh->b_end_io = end_buffer_write_sync;
				2966	ret = submit_bh(WRITE, bh);
				2967	wait_on_buffer(bh);
				2968	if (buffer_eopnotsupp(bh)) {
				2969	clear_buffer_eopnotsupp(bh);
				2970	ret = -EOPNOTSUPP;
				2971	}
				2972	if (!ret && !buffer_uptodate(bh))
				2973	ret = -EIO;
				2974	} else {
				2975	unlock_buffer(bh);
				2976	}
				2977	return ret;
				2978	}
				2979
				2980	/*
				2981	* try_to_free_buffers() checks if all the buffers on this particular page
				2982	* are unused, and releases them if so.
				2983	*
				2984	* Exclusion against try_to_free_buffers may be obtained by either
				2985	* locking the page or by holding its mapping's private_lock.
				2986	*
				2987	* If the page is dirty but all the buffers are clean then we need to
				2988	* be sure to mark the page clean as well. This is because the page
				2989	* may be against a block device, and a later reattachment of buffers
				2990	* to a dirty page will set all buffers dirty. Which would corrupt
				2991	* filesystem data on the same device.
				2992	*
				2993	* The same applies to regular filesystem pages: if all the buffers are
				2994	* clean then we set the page clean and proceed. To do that, we require
				2995	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				2996	* private_lock.
				2997	*
				2998	* try_to_free_buffers() is non-blocking.
				2999	*/
				3000	static inline int buffer_busy(struct buffer_head *bh)
				3001	{
				3002	return atomic_read(&bh->b_count) \|
				3003	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				3004	}
				3005
				3006	static int
				3007	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				3008	{
				3009	struct buffer_head *head = page_buffers(page);
				3010	struct buffer_head *bh;
				3011
				3012	bh = head;
				3013	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	3014	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3015	set_bit(AS_EIO, &page->mapping->flags);
				3016	if (buffer_busy(bh))
				3017	goto failed;
				3018	bh = bh->b_this_page;
				3019	} while (bh != head);
				3020
				3021	do {
				3022	struct buffer_head *next = bh->b_this_page;
				3023
				3024	if (!list_empty(&bh->b_assoc_buffers))
				3025	__remove_assoc_queue(bh);
				3026	bh = next;
				3027	} while (bh != head);
				3028	*buffers_to_free = head;
				3029	__clear_page_buffers(page);
				3030	return 1;
				3031	failed:
				3032	return 0;
				3033	}
				3034
				3035	int try_to_free_buffers(struct page *page)
				3036	{
				3037	struct address_space * const mapping = page->mapping;
				3038	struct buffer_head *buffers_to_free = NULL;
				3039	int ret = 0;
				3040
				3041	BUG_ON(!PageLocked(page));
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3042	if (PageWriteback(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3043	return 0;
				3044
				3045	if (mapping == NULL) { /* can this still happen? */
				3046	ret = drop_buffers(page, &buffers_to_free);
				3047	goto out;
				3048	}
				3049
				3050	spin_lock(&mapping->private_lock);
				3051	ret = drop_buffers(page, &buffers_to_free);
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3052
				3053	/*
				3054	* If the filesystem writes its buffers by hand (eg ext3)
				3055	* then we can have clean buffers against a dirty page. We
				3056	* clean the page here; otherwise the VM will never notice
				3057	* that the filesystem did any IO at all.
				3058	*
				3059	* Also, during truncate, discard_buffer will have marked all
				3060	* the page's buffers clean. We discover that here and clean
				3061	* the page also.
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3062	*
				3063	* private_lock must be held over this entire operation in order
				3064	* to synchronise against __set_page_dirty_buffers and prevent the
				3065	* dirty bit from being lost.
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3066	*/
				3067	if (ret)
				3068	cancel_dirty_page(page, PAGE_CACHE_SIZE);
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3069	spin_unlock(&mapping->private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3070	out:
				3071	if (buffers_to_free) {
				3072	struct buffer_head *bh = buffers_to_free;
				3073
				3074	do {
				3075	struct buffer_head *next = bh->b_this_page;
				3076	free_buffer_head(bh);
				3077	bh = next;
				3078	} while (bh != buffers_to_free);
				3079	}
				3080	return ret;
				3081	}
				3082	EXPORT_SYMBOL(try_to_free_buffers);
				3083
NeilBrown	3978d71	2006-03-26 01:37:17 -0800	[diff] [blame]	3084	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3085	{
				3086	struct address_space *mapping;
				3087
				3088	smp_mb();
				3089	mapping = page_mapping(page);
				3090	if (mapping)
				3091	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3092	}
				3093
				3094	/*
				3095	* There are no bdflush tunables left. But distributions are
				3096	* still running obsolete flush daemons, so we terminate them here.
				3097	*
				3098	* Use of bdflush() is deprecated and will be removed in a future kernel.
				3099	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				3100	*/
				3101	asmlinkage long sys_bdflush(int func, long data)
				3102	{
				3103	static int msg_count;
				3104
				3105	if (!capable(CAP_SYS_ADMIN))
				3106	return -EPERM;
				3107
				3108	if (msg_count < 5) {
				3109	msg_count++;
				3110	printk(KERN_INFO
				3111	"warning: process `%s' used the obsolete bdflush"
				3112	" system call\n", current->comm);
				3113	printk(KERN_INFO "Fix your initscripts?\n");
				3114	}
				3115
				3116	if (func == 1)
				3117	do_exit(0);
				3118	return 0;
				3119	}
				3120
				3121	/*
				3122	* Buffer-head allocation
				3123	*/
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	3124	static struct kmem_cache *bh_cachep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3125
				3126	/*
				3127	* Once the number of bh's in the machine exceeds this level, we start
				3128	* stripping them in writeback.
				3129	*/
				3130	static int max_buffer_heads;
				3131
				3132	int buffer_heads_over_limit;
				3133
				3134	struct bh_accounting {
				3135	int nr; /* Number of live bh's */
				3136	int ratelimit; /* Limit cacheline bouncing */
				3137	};
				3138
				3139	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3140
				3141	static void recalc_bh_state(void)
				3142	{
				3143	int i;
				3144	int tot = 0;
				3145
				3146	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				3147	return;
				3148	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3149	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3150	tot += per_cpu(bh_accounting, i).nr;
				3151	buffer_heads_over_limit = (tot > max_buffer_heads);
				3152	}
				3153
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	3154	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3155	{
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3156	struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
Mel Gorman	e12ba74	2007-10-16 01:25:52 -0700	[diff] [blame]	3157	set_migrateflags(gfp_flags, __GFP_RECLAIMABLE));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3158	if (ret) {
Christoph Lameter	a35afb8	2007-05-16 22:10:57 -0700	[diff] [blame]	3159	INIT_LIST_HEAD(&ret->b_assoc_buffers);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3160	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3161	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3162	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3163	}
				3164	return ret;
				3165	}
				3166	EXPORT_SYMBOL(alloc_buffer_head);
				3167
				3168	void free_buffer_head(struct buffer_head *bh)
				3169	{
				3170	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3171	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3172	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3173	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3174	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3175	}
				3176	EXPORT_SYMBOL(free_buffer_head);
				3177
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3178	static void buffer_exit_cpu(int cpu)
				3179	{
				3180	int i;
				3181	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3182
				3183	for (i = 0; i < BH_LRU_SIZE; i++) {
				3184	brelse(b->bhs[i]);
				3185	b->bhs[i] = NULL;
				3186	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3187	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				3188	per_cpu(bh_accounting, cpu).nr = 0;
				3189	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3190	}
				3191
				3192	static int buffer_cpu_notify(struct notifier_block *self,
				3193	unsigned long action, void *hcpu)
				3194	{
Rafael J. Wysocki	8bb7844	2007-05-09 02:35:10 -0700	[diff] [blame]	3195	if (action == CPU_DEAD \|\| action == CPU_DEAD_FROZEN)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3196	buffer_exit_cpu((unsigned long)hcpu);
				3197	return NOTIFY_OK;
				3198	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3199
/**
 * bh_uptodate_or_lock: Check a buffer for up-to-date contents, locking it
 * if they are missing
 * @bh: struct buffer_head
 *
 * Returns 1 with the buffer unlocked when it is up-to-date.
 * Returns 0 with the buffer left locked when it is not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);

	/* Re-check under the lock; keep the lock for the caller on failure. */
	if (!buffer_uptodate(bh))
		return 0;

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
				3218
				3219	/**
				3220	* bh_submit_read: Submit a locked buffer for reading
				3221	* @bh: struct buffer_head
				3222	*
				3223	* Returns zero on success and -EIO on error.
				3224	*/
				3225	int bh_submit_read(struct buffer_head *bh)
				3226	{
				3227	BUG_ON(!buffer_locked(bh));
				3228
				3229	if (buffer_uptodate(bh)) {
				3230	unlock_buffer(bh);
				3231	return 0;
				3232	}
				3233
				3234	get_bh(bh);
				3235	bh->b_end_io = end_buffer_read_sync;
				3236	submit_bh(READ, bh);
				3237	wait_on_buffer(bh);
				3238	if (buffer_uptodate(bh))
				3239	return 0;
				3240	return -EIO;
				3241	}
				3242	EXPORT_SYMBOL(bh_submit_read);
				3243
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3244	static void
				3245	init_buffer_head(struct kmem_cache cachep, void data)
				3246	{
				3247	struct buffer_head *bh = data;
				3248
				3249	memset(bh, 0, sizeof(*bh));
				3250	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				3251	}
				3252
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3253	void __init buffer_init(void)
				3254	{
				3255	int nrpages;
				3256
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3257	bh_cachep = kmem_cache_create("buffer_head",
				3258	sizeof(struct buffer_head), 0,
				3259	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3260	SLAB_MEM_SPREAD),
				3261	init_buffer_head);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3262
				3263	/*
				3264	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3265	*/
				3266	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3267	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3268	hotcpu_notifier(buffer_cpu_notify, 0);
				3269	}
				3270
				3271	EXPORT_SYMBOL(__bforget);
				3272	EXPORT_SYMBOL(__brelse);
				3273	EXPORT_SYMBOL(__wait_on_buffer);
				3274	EXPORT_SYMBOL(block_commit_write);
				3275	EXPORT_SYMBOL(block_prepare_write);
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	3276	EXPORT_SYMBOL(block_page_mkwrite);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3277	EXPORT_SYMBOL(block_read_full_page);
				3278	EXPORT_SYMBOL(block_sync_page);
				3279	EXPORT_SYMBOL(block_truncate_page);
				3280	EXPORT_SYMBOL(block_write_full_page);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	3281	EXPORT_SYMBOL(cont_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3282	EXPORT_SYMBOL(end_buffer_read_sync);
				3283	EXPORT_SYMBOL(end_buffer_write_sync);
				3284	EXPORT_SYMBOL(file_fsync);
				3285	EXPORT_SYMBOL(fsync_bdev);
				3286	EXPORT_SYMBOL(generic_block_bmap);
				3287	EXPORT_SYMBOL(generic_commit_write);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	3288	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3289	EXPORT_SYMBOL(init_buffer);
				3290	EXPORT_SYMBOL(invalidate_bdev);
				3291	EXPORT_SYMBOL(ll_rw_block);
				3292	EXPORT_SYMBOL(mark_buffer_dirty);
				3293	EXPORT_SYMBOL(submit_bh);
				3294	EXPORT_SYMBOL(sync_dirty_buffer);
				3295	EXPORT_SYMBOL(unlock_buffer);