Blame - fs/buffer.c - kernel/msm

blob: 3b226f7d96969d166aabbae758fa073c81d478ec [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required by older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
				27	#include <linux/smp_lock.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	28	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	29	#include <linux/blkdev.h>
				30	#include <linux/file.h>
				31	#include <linux/quotaops.h>
				32	#include <linux/highmem.h>
				33	#include <linux/module.h>
				34	#include <linux/writeback.h>
				35	#include <linux/hash.h>
				36	#include <linux/suspend.h>
				37	#include <linux/buffer_head.h>
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	38	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	39	#include <linux/bio.h>
				40	#include <linux/notifier.h>
				41	#include <linux/cpu.h>
				42	#include <linux/bitops.h>
				43	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	44	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	45
				46	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	47
				48	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				49
				50	inline void
				51	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				52	{
				53	bh->b_end_io = handler;
				54	bh->b_private = private;
				55	}
				56
				57	static int sync_buffer(void *word)
				58	{
				59	struct block_device *bd;
				60	struct buffer_head *bh
				61	= container_of(word, struct buffer_head, b_state);
				62
				63	smp_mb();
				64	bd = bh->b_bdev;
				65	if (bd)
				66	blk_run_address_space(bd->bd_inode->i_mapping);
				67	io_schedule();
				68	return 0;
				69	}
				70
				71	void fastcall __lock_buffer(struct buffer_head *bh)
				72	{
				73	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				74	TASK_UNINTERRUPTIBLE);
				75	}
				76	EXPORT_SYMBOL(__lock_buffer);
				77
				78	void fastcall unlock_buffer(struct buffer_head *bh)
				79	{
Nick Piggin	72ed3d0	2007-02-10 01:46:22 -0800	[diff] [blame]	80	smp_mb__before_clear_bit();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	81	clear_buffer_locked(bh);
				82	smp_mb__after_clear_bit();
				83	wake_up_bit(&bh->b_state, BH_Lock);
				84	}
				85
				86	/*
				87	* Block until a buffer comes unlocked. This doesn't stop it
				88	* from becoming locked again - you have to lock it yourself
				89	* if you want to preserve its state.
				90	*/
				91	void __wait_on_buffer(struct buffer_head * bh)
				92	{
				93	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				94	}
				95
				96	static void
				97	__clear_page_buffers(struct page *page)
				98	{
				99	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	100	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	101	page_cache_release(page);
				102	}
				103
				104	static void buffer_io_error(struct buffer_head *bh)
				105	{
				106	char b[BDEVNAME_SIZE];
				107
				108	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				109	bdevname(bh->b_bdev, b),
				110	(unsigned long long)bh->b_blocknr);
				111	}
				112
				113	/*
				114	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				115	* unlock the buffer. This is what ll_rw_block uses too.
				116	*/
				117	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				118	{
				119	if (uptodate) {
				120	set_buffer_uptodate(bh);
				121	} else {
				122	/* This happens, due to failed READA attempts. */
				123	clear_buffer_uptodate(bh);
				124	}
				125	unlock_buffer(bh);
				126	put_bh(bh);
				127	}
				128
				129	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				130	{
				131	char b[BDEVNAME_SIZE];
				132
				133	if (uptodate) {
				134	set_buffer_uptodate(bh);
				135	} else {
				136	if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
				137	buffer_io_error(bh);
				138	printk(KERN_WARNING "lost page write due to "
				139	"I/O error on %s\n",
				140	bdevname(bh->b_bdev, b));
				141	}
				142	set_buffer_write_io_error(bh);
				143	clear_buffer_uptodate(bh);
				144	}
				145	unlock_buffer(bh);
				146	put_bh(bh);
				147	}
				148
				149	/*
				150	* Write out and wait upon all the dirty data associated with a block
				151	* device via its mapping. Does not take the superblock lock.
				152	*/
				153	int sync_blockdev(struct block_device *bdev)
				154	{
				155	int ret = 0;
				156
OGAWA Hirofumi	28fd129	2006-01-08 01:02:14 -0800	[diff] [blame]	157	if (bdev)
				158	ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	159	return ret;
				160	}
				161	EXPORT_SYMBOL(sync_blockdev);
				162
/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 *
 * If get_super() finds a superblock for @bdev, the result of
 * fsync_super() is returned; otherwise the result of sync_blockdev().
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int res;

	if (!sb)
		return sync_blockdev(bdev);

	res = fsync_super(sb);
	drop_super(sb);
	return res;
}
				178
				179	/**
				180	* freeze_bdev -- lock a filesystem and force it into a consistent state
				181	* @bdev: blockdevice to lock
				182	*
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	183	* This takes the block device bd_mount_sem to make sure no new mounts
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	184	* happen on bdev until thaw_bdev() is called.
				185	* If a superblock is found on this device, we take the s_umount semaphore
				186	* on it to make sure nobody unmounts until the snapshot creation is done.
				187	*/
				188	struct super_block freeze_bdev(struct block_device bdev)
				189	{
				190	struct super_block *sb;
				191
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	192	down(&bdev->bd_mount_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	193	sb = get_super(bdev);
				194	if (sb && !(sb->s_flags & MS_RDONLY)) {
				195	sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	196	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	197
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	198	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	199
				200	sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	201	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	202
				203	sync_blockdev(sb->s_bdev);
				204
				205	if (sb->s_op->write_super_lockfs)
				206	sb->s_op->write_super_lockfs(sb);
				207	}
				208
				209	sync_blockdev(bdev);
				210	return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
				211	}
				212	EXPORT_SYMBOL(freeze_bdev);
				213
				214	/**
				215	* thaw_bdev -- unlock filesystem
				216	* @bdev: blockdevice to unlock
				217	* @sb: associated superblock
				218	*
				219	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
				220	*/
				221	void thaw_bdev(struct block_device bdev, struct super_block sb)
				222	{
				223	if (sb) {
				224	BUG_ON(sb->s_bdev != bdev);
				225
				226	if (sb->s_op->unlockfs)
				227	sb->s_op->unlockfs(sb);
				228	sb->s_frozen = SB_UNFROZEN;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	229	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	230	wake_up(&sb->s_wait_unfrozen);
				231	drop_super(sb);
				232	}
				233
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	234	up(&bdev->bd_mount_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	235	}
				236	EXPORT_SYMBOL(thaw_bdev);
				237
				238	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	239	* Various filesystems appear to want __find_get_block to be non-blocking.
				240	* But it's the page lock which protects the buffers. To get around this,
				241	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				242	* private_lock.
				243	*
				244	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				245	* may be quite high. This code could TryLock the page, and if that
				246	* succeeds, there is no need to take private_lock. (But if
				247	* private_lock is contended then so is mapping->tree_lock).
				248	*/
				249	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	250	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	251	{
				252	struct inode *bd_inode = bdev->bd_inode;
				253	struct address_space *bd_mapping = bd_inode->i_mapping;
				254	struct buffer_head *ret = NULL;
				255	pgoff_t index;
				256	struct buffer_head *bh;
				257	struct buffer_head *head;
				258	struct page *page;
				259	int all_mapped = 1;
				260
				261	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				262	page = find_get_page(bd_mapping, index);
				263	if (!page)
				264	goto out;
				265
				266	spin_lock(&bd_mapping->private_lock);
				267	if (!page_has_buffers(page))
				268	goto out_unlock;
				269	head = page_buffers(page);
				270	bh = head;
				271	do {
				272	if (bh->b_blocknr == block) {
				273	ret = bh;
				274	get_bh(bh);
				275	goto out_unlock;
				276	}
				277	if (!buffer_mapped(bh))
				278	all_mapped = 0;
				279	bh = bh->b_this_page;
				280	} while (bh != head);
				281
				282	/* we might be here because some of the buffers on this page are
				283	* not mapped. This is due to various races between
				284	* file io on the block device and getblk. It gets dealt with
				285	* elsewhere, don't buffer_error if we had some unmapped buffers
				286	*/
				287	if (all_mapped) {
				288	printk("__find_get_block_slow() failed. "
				289	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	290	(unsigned long long)block,
				291	(unsigned long long)bh->b_blocknr);
				292	printk("b_state=0x%08lx, b_size=%zu\n",
				293	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	294	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				295	}
				296	out_unlock:
				297	spin_unlock(&bd_mapping->private_lock);
				298	page_cache_release(page);
				299	out:
				300	return ret;
				301	}
				302
				303	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				304	of fs corruption is going on. Trashing dirty data always implies losing
				305	information that was supposed to be just stored on the physical layer
				306	by the user.
				307
				308	Thus invalidate_buffers in general usage is not allowed to trash
				309	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				310	be preserved. These buffers are simply skipped.
				311
				312	We also skip buffers which are still in use. For example this can
				313	happen if a userspace program is reading the block device.
				314
				315	NOTE: In the case where the user removed a removable-media-disk even if
				316	there's still dirty data not synced on disk (due to a bug in the device driver
				317	or due to an error of the user), by not destroying the dirty buffers we could
				318	generate corruption also on the next media inserted, thus a parameter is
				319	necessary to handle this case in the most safe way possible (trying
				320	to not corrupt also the new disk inserted with the data belonging to
				321	the old now corrupted disk). Also for the ramdisk the natural thing
				322	to do in order to release the ramdisk memory is to destroy dirty buffers.
				323
				324	These are two special cases. Normal usage imply the device driver
				325	to issue a sync on the device (without waiting I/O completion) and
				326	then an invalidate_buffers call that doesn't trash dirty buffers.
				327
				328	For handling cache coherency with the blkdev pagecache the 'update' case
				329	is been introduced. It is needed to re-read from disk any pinned
				330	buffer. NOTE: re-reading from disk is destructive so we can do it only
				331	when we assume nobody is changing the buffercache under our I/O and when
				332	we think the disk contains more recent information than the buffercache.
				333	The update == 1 pass marks the buffers we need to update, the update == 2
				334	pass does the actual I/O. */
Peter Zijlstra	f98393a	2007-05-06 14:49:54 -0700	[diff] [blame]	335	void invalidate_bdev(struct block_device *bdev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	336	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	337	struct address_space *mapping = bdev->bd_inode->i_mapping;
				338
				339	if (mapping->nrpages == 0)
				340	return;
				341
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	342	invalidate_bh_lrus();
Andrew Morton	fc0ecff	2007-02-10 01:45:39 -0800	[diff] [blame]	343	invalidate_mapping_pages(mapping, 0, -1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	344	}
				345
				346	/*
				347	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				348	*/
				349	static void free_more_memory(void)
				350	{
				351	struct zone **zones;
				352	pg_data_t *pgdat;
				353
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	354	wakeup_pdflush(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	355	yield();
				356
KAMEZAWA Hiroyuki	ec936fc	2006-03-27 01:15:59 -0800	[diff] [blame]	357	for_each_online_pgdat(pgdat) {
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	358	zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	359	if (*zones)
Darren Hart	1ad539b	2005-06-21 17:14:53 -0700	[diff] [blame]	360	try_to_free_pages(zones, GFP_NOFS);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	361	}
				362	}
				363
				364	/*
				365	* I/O completion handler for block_read_full_page() - pages
				366	* which come unlocked at the end of I/O.
				367	*/
				368	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				369	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	370	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	371	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	372	struct buffer_head *tmp;
				373	struct page *page;
				374	int page_uptodate = 1;
				375
				376	BUG_ON(!buffer_async_read(bh));
				377
				378	page = bh->b_page;
				379	if (uptodate) {
				380	set_buffer_uptodate(bh);
				381	} else {
				382	clear_buffer_uptodate(bh);
				383	if (printk_ratelimit())
				384	buffer_io_error(bh);
				385	SetPageError(page);
				386	}
				387
				388	/*
				389	* Be _very_ careful from here on. Bad things can happen if
				390	* two buffer heads end IO at almost the same time and both
				391	* decide that the page is now completely done.
				392	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	393	first = page_buffers(page);
				394	local_irq_save(flags);
				395	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	396	clear_buffer_async_read(bh);
				397	unlock_buffer(bh);
				398	tmp = bh;
				399	do {
				400	if (!buffer_uptodate(tmp))
				401	page_uptodate = 0;
				402	if (buffer_async_read(tmp)) {
				403	BUG_ON(!buffer_locked(tmp));
				404	goto still_busy;
				405	}
				406	tmp = tmp->b_this_page;
				407	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	408	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				409	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	410
				411	/*
				412	* If none of the buffers had errors and they are all
				413	* uptodate then we can set the page uptodate.
				414	*/
				415	if (page_uptodate && !PageError(page))
				416	SetPageUptodate(page);
				417	unlock_page(page);
				418	return;
				419
				420	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	421	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				422	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	423	return;
				424	}
				425
				426	/*
				427	* Completion handler for block_write_full_page() - pages which are unlocked
				428	* during I/O, and which have PageWriteback cleared upon I/O completion.
				429	*/
Adrian Bunk	b6cd0b7	2006-06-27 02:53:54 -0700	[diff] [blame]	430	static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	431	{
				432	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	433	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	434	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	435	struct buffer_head *tmp;
				436	struct page *page;
				437
				438	BUG_ON(!buffer_async_write(bh));
				439
				440	page = bh->b_page;
				441	if (uptodate) {
				442	set_buffer_uptodate(bh);
				443	} else {
				444	if (printk_ratelimit()) {
				445	buffer_io_error(bh);
				446	printk(KERN_WARNING "lost page write due to "
				447	"I/O error on %s\n",
				448	bdevname(bh->b_bdev, b));
				449	}
				450	set_bit(AS_EIO, &page->mapping->flags);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	451	set_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	452	clear_buffer_uptodate(bh);
				453	SetPageError(page);
				454	}
				455
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	456	first = page_buffers(page);
				457	local_irq_save(flags);
				458	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				459
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	460	clear_buffer_async_write(bh);
				461	unlock_buffer(bh);
				462	tmp = bh->b_this_page;
				463	while (tmp != bh) {
				464	if (buffer_async_write(tmp)) {
				465	BUG_ON(!buffer_locked(tmp));
				466	goto still_busy;
				467	}
				468	tmp = tmp->b_this_page;
				469	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	470	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				471	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	472	end_page_writeback(page);
				473	return;
				474
				475	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	476	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				477	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	478	return;
				479	}
				480
				481	/*
				482	* If a page's buffers are under async readin (end_buffer_async_read
				483	* completion) then there is a possibility that another thread of
				484	* control could lock one of the buffers after it has completed
				485	* but while some of the other buffers have not completed. This
				486	* locked buffer would confuse end_buffer_async_read() into not unlocking
				487	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				488	* that this buffer is not under async I/O.
				489	*
				490	* The page comes unlocked when it has no locked buffer_async buffers
				491	* left.
				492	*
				493	* PageLocked prevents anyone starting new async I/O reads any of
				494	* the buffers.
				495	*
				496	* PageWriteback is used to prevent simultaneous writeout of the same
				497	* page.
				498	*
				499	* PageLocked prevents anyone from starting writeback of a page which is
				500	* under read I/O (PageWriteback is only ever set against a locked page).
				501	*/
				502	static void mark_buffer_async_read(struct buffer_head *bh)
				503	{
				504	bh->b_end_io = end_buffer_async_read;
				505	set_buffer_async_read(bh);
				506	}
				507
				508	void mark_buffer_async_write(struct buffer_head *bh)
				509	{
				510	bh->b_end_io = end_buffer_async_write;
				511	set_buffer_async_write(bh);
				512	}
				513	EXPORT_SYMBOL(mark_buffer_async_write);
				514
				515
				516	/*
				517	* fs/buffer.c contains helper functions for buffer-backed address space's
				518	* fsync functions. A common requirement for buffer-based filesystems is
				519	* that certain data from the backing blockdev needs to be written out for
				520	* a successful fsync(). For example, ext2 indirect blocks need to be
				521	* written back and waited upon before fsync() returns.
				522	*
				523	* The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
				524	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				525	* management of a list of dependent buffers at ->i_mapping->private_list.
				526	*
				527	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				528	* from their controlling inode's queue when they are being freed. But
				529	* try_to_free_buffers() will be operating against the blockdev mapping
				530	* at the time, not against the S_ISREG file which depends on those buffers.
				531	* So the locking for private_list is via the private_lock in the address_space
				532	* which backs the buffers. Which is different from the address_space
				533	* against which the buffers are listed. So for a particular address_space,
				534	* mapping->private_lock does not protect mapping->private_list! In fact,
				535	* mapping->private_list will always be protected by the backing blockdev's
				536	* ->private_lock.
				537	*
				538	* Which introduces a requirement: all buffers on an address_space's
				539	* ->private_list must be from the same address_space: the blockdev's.
				540	*
				541	* address_spaces which do not place buffers at ->private_list via these
				542	* utility functions are free to use private_lock and private_list for
				543	* whatever they want. The only requirement is that list_empty(private_list)
				544	* be true at clear_inode() time.
				545	*
				546	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				547	* filesystems should do that. invalidate_inode_buffers() should just go
				548	* BUG_ON(!list_empty).
				549	*
				550	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				551	* take an address_space, not an inode. And it should be called
				552	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				553	* queued up.
				554	*
				555	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				556	* list if it is already on a list. Because if the buffer is on a list,
				557	* it must already be on the right one. If not, the filesystem is being
				558	* silly. This will save a ton of locking. But first we have to ensure
				559	* that buffers are taken off the old inode's list when they are freed
				560	* (presumably in truncate). That requires careful auditing of all
				561	* filesystems (do it inside bforget()). It could also be done by bringing
				562	* b_inode back.
				563	*/
				564
				565	/*
				566	* The buffer's backing address_space's private_lock must be held
				567	*/
				568	static inline void __remove_assoc_queue(struct buffer_head *bh)
				569	{
				570	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	571	WARN_ON(!bh->b_assoc_map);
				572	if (buffer_write_io_error(bh))
				573	set_bit(AS_EIO, &bh->b_assoc_map->flags);
				574	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	575	}
				576
				577	int inode_has_buffers(struct inode *inode)
				578	{
				579	return !list_empty(&inode->i_data.private_list);
				580	}
				581
				582	/*
				583	* osync is designed to support O_SYNC io. It waits synchronously for
				584	* all already-submitted IO to complete, but does not queue any new
				585	* writes to the disk.
				586	*
				587	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				588	* you dirty the buffers, and then use osync_inode_buffers to wait for
				589	* completion. Any other dirty buffers which are not yet queued for
				590	* write will not be flushed to disk by the osync.
				591	*/
				592	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				593	{
				594	struct buffer_head *bh;
				595	struct list_head *p;
				596	int err = 0;
				597
				598	spin_lock(lock);
				599	repeat:
				600	list_for_each_prev(p, list) {
				601	bh = BH_ENTRY(p);
				602	if (buffer_locked(bh)) {
				603	get_bh(bh);
				604	spin_unlock(lock);
				605	wait_on_buffer(bh);
				606	if (!buffer_uptodate(bh))
				607	err = -EIO;
				608	brelse(bh);
				609	spin_lock(lock);
				610	goto repeat;
				611	}
				612	}
				613	spin_unlock(lock);
				614	return err;
				615	}
				616
				617	/**
				618	* sync_mapping_buffers - write out and wait upon a mapping's "associated"
				619	* buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	620	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	621	*
				622	* Starts I/O against the buffers at mapping->private_list, and waits upon
				623	* that I/O.
				624	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	625	* Basically, this is a convenience function for fsync().
				626	* @mapping is a file or directory which needs those buffers to be written for
				627	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	628	*/
				629	int sync_mapping_buffers(struct address_space *mapping)
				630	{
				631	struct address_space *buffer_mapping = mapping->assoc_mapping;
				632
				633	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				634	return 0;
				635
				636	return fsync_buffers_list(&buffer_mapping->private_lock,
				637	&mapping->private_list);
				638	}
				639	EXPORT_SYMBOL(sync_mapping_buffers);
				640
				641	/*
				642	* Called when we've recently written block `bblock', and it is known that
				643	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				644	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				645	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				646	*/
				647	void write_boundary_block(struct block_device *bdev,
				648	sector_t bblock, unsigned blocksize)
				649	{
				650	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				651	if (bh) {
				652	if (buffer_dirty(bh))
				653	ll_rw_block(WRITE, 1, &bh);
				654	put_bh(bh);
				655	}
				656	}
				657
				658	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				659	{
				660	struct address_space *mapping = inode->i_mapping;
				661	struct address_space *buffer_mapping = bh->b_page->mapping;
				662
				663	mark_buffer_dirty(bh);
				664	if (!mapping->assoc_mapping) {
				665	mapping->assoc_mapping = buffer_mapping;
				666	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	667	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	668	}
				669	if (list_empty(&bh->b_assoc_buffers)) {
				670	spin_lock(&buffer_mapping->private_lock);
				671	list_move_tail(&bh->b_assoc_buffers,
				672	&mapping->private_list);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	673	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	674	spin_unlock(&buffer_mapping->private_lock);
				675	}
				676	}
				677	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				678
				679	/*
				680	* Add a page to the dirty page list.
				681	*
				682	* It is a sad fact of life that this function is called from several places
				683	* deeply under spinlocking. It may not sleep.
				684	*
				685	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				686	* dirty-state coherency between the page and the buffers. It the page does
				687	* not have buffers then when they are later attached they will all be set
				688	* dirty.
				689	*
				690	* The buffers are dirtied before the page is dirtied. There's a small race
				691	* window in which a writepage caller may see the page cleanness but not the
				692	* buffer dirtiness. That's fine. If this code were to set the page dirty
				693	* before the buffers, a concurrent writepage caller could clear the page dirty
				694	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				695	* page on the dirty page list.
				696	*
				697	* We use private_lock to lock against try_to_free_buffers while using the
				698	* page's buffer list. Also use this to protect against clean buffers being
				699	* added to the page after it was set dirty.
				700	*
				701	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				702	* address_space though.
				703	*/
				704	int __set_page_dirty_buffers(struct page *page)
				705	{
Nick Piggin	ebf7a22	2006-10-10 04:36:54 +0200	[diff] [blame]	706	struct address_space * const mapping = page_mapping(page);
				707
				708	if (unlikely(!mapping))
				709	return !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	710
				711	spin_lock(&mapping->private_lock);
				712	if (page_has_buffers(page)) {
				713	struct buffer_head *head = page_buffers(page);
				714	struct buffer_head *bh = head;
				715
				716	do {
				717	set_buffer_dirty(bh);
				718	bh = bh->b_this_page;
				719	} while (bh != head);
				720	}
				721	spin_unlock(&mapping->private_lock);
				722
Andrew Morton	8c08540	2006-12-10 02:19:24 -0800	[diff] [blame]	723	if (TestSetPageDirty(page))
				724	return 0;
				725
				726	write_lock_irq(&mapping->tree_lock);
				727	if (page->mapping) { /* Race with truncate? */
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	728	if (mapping_cap_account_dirty(mapping)) {
Andrew Morton	8c08540	2006-12-10 02:19:24 -0800	[diff] [blame]	729	__inc_zone_page_state(page, NR_FILE_DIRTY);
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	730	task_io_account_write(PAGE_CACHE_SIZE);
				731	}
Andrew Morton	8c08540	2006-12-10 02:19:24 -0800	[diff] [blame]	732	radix_tree_tag_set(&mapping->page_tree,
				733	page_index(page), PAGECACHE_TAG_DIRTY);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	734	}
Andrew Morton	8c08540	2006-12-10 02:19:24 -0800	[diff] [blame]	735	write_unlock_irq(&mapping->tree_lock);
				736	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
				737	return 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	738	}
				739	EXPORT_SYMBOL(__set_page_dirty_buffers);
				740
				741	/*
				742	* Write out and wait upon a list of buffers.
				743	*
				744	* We have conflicting pressures: we want to make sure that all
				745	* initially dirty buffers get waited on, but that any subsequently
				746	* dirtied buffers don't. After all, we don't want fsync to last
				747	* forever if somebody is actively writing to the file.
				748	*
				749	* Do this in two main stages: first we copy dirty buffers to a
				750	* temporary inode list, queueing the writes as we go. Then we clean
				751	* up, waiting for those writes to complete.
				752	*
				753	* During this second stage, any subsequent updates to the file may end
				754	* up refiling the buffer on the original inode's dirty list again, so
				755	* there is a chance we will end up with a buffer queued for write but
				756	* not yet completed on that list. So, as a final cleanup we go through
				757	* the osync code to catch these locked, dirty buffers without requeuing
				758	* any newly dirty buffers for write.
				759	*/
				760	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				761	{
				762	struct buffer_head *bh;
				763	struct list_head tmp;
				764	int err = 0, err2;
				765
				766	INIT_LIST_HEAD(&tmp);
				767
				768	spin_lock(lock);
				769	while (!list_empty(list)) {
				770	bh = BH_ENTRY(list->next);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	771	__remove_assoc_queue(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	772	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				773	list_add(&bh->b_assoc_buffers, &tmp);
				774	if (buffer_dirty(bh)) {
				775	get_bh(bh);
				776	spin_unlock(lock);
				777	/*
				778	* Ensure any pending I/O completes so that
				779	* ll_rw_block() actually writes the current
				780	* contents - it is a noop if I/O is still in
				781	* flight on potentially older contents.
				782	*/
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	783	ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	784	brelse(bh);
				785	spin_lock(lock);
				786	}
				787	}
				788	}
				789
				790	while (!list_empty(&tmp)) {
				791	bh = BH_ENTRY(tmp.prev);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	792	list_del_init(&bh->b_assoc_buffers);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	793	get_bh(bh);
				794	spin_unlock(lock);
				795	wait_on_buffer(bh);
				796	if (!buffer_uptodate(bh))
				797	err = -EIO;
				798	brelse(bh);
				799	spin_lock(lock);
				800	}
				801
				802	spin_unlock(lock);
				803	err2 = osync_buffers_list(lock, list);
				804	if (err)
				805	return err;
				806	else
				807	return err2;
				808	}
				809
				810	/*
				811	* Invalidate any and all dirty buffers on a given inode. We are
				812	* probably unmounting the fs, but that doesn't mean we have already
				813	* done a sync(). Just drop the buffers from the inode list.
				814	*
				815	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				816	* assumes that all the buffers are against the blockdev. Not true
				817	* for reiserfs.
				818	*/
				819	void invalidate_inode_buffers(struct inode *inode)
				820	{
				821	if (inode_has_buffers(inode)) {
				822	struct address_space *mapping = &inode->i_data;
				823	struct list_head *list = &mapping->private_list;
				824	struct address_space *buffer_mapping = mapping->assoc_mapping;
				825
				826	spin_lock(&buffer_mapping->private_lock);
				827	while (!list_empty(list))
				828	__remove_assoc_queue(BH_ENTRY(list->next));
				829	spin_unlock(&buffer_mapping->private_lock);
				830	}
				831	}
				832
				833	/*
				834	* Remove any clean buffers from the inode's buffer list. This is called
				835	* when we're trying to free the inode itself. Those buffers can pin it.
				836	*
				837	* Returns true if all buffers were removed.
				838	*/
				839	int remove_inode_buffers(struct inode *inode)
				840	{
				841	int ret = 1;
				842
				843	if (inode_has_buffers(inode)) {
				844	struct address_space *mapping = &inode->i_data;
				845	struct list_head *list = &mapping->private_list;
				846	struct address_space *buffer_mapping = mapping->assoc_mapping;
				847
				848	spin_lock(&buffer_mapping->private_lock);
				849	while (!list_empty(list)) {
				850	struct buffer_head *bh = BH_ENTRY(list->next);
				851	if (buffer_dirty(bh)) {
				852	ret = 0;
				853	break;
				854	}
				855	__remove_assoc_queue(bh);
				856	}
				857	spin_unlock(&buffer_mapping->private_lock);
				858	}
				859	return ret;
				860	}
				861
				862	/*
				863	* Create the appropriate buffers when given a page for data area and
				864	* the size of each buffer.. Use the bh->b_this_page linked list to
				865	* follow the buffers created. Return NULL if unable to create more
				866	* buffers.
				867	*
				868	* The retry flag is used to differentiate async IO (paging, swapping)
				869	* which may not fail from ordinary buffer allocations.
				870	*/
				871	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				872	int retry)
				873	{
				874	struct buffer_head bh, head;
				875	long offset;
				876
				877	try_again:
				878	head = NULL;
				879	offset = PAGE_SIZE;
				880	while ((offset -= size) >= 0) {
				881	bh = alloc_buffer_head(GFP_NOFS);
				882	if (!bh)
				883	goto no_grow;
				884
				885	bh->b_bdev = NULL;
				886	bh->b_this_page = head;
				887	bh->b_blocknr = -1;
				888	head = bh;
				889
				890	bh->b_state = 0;
				891	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	892	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	893	bh->b_size = size;
				894
				895	/* Link the buffer to its page */
				896	set_bh_page(bh, page, offset);
				897
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	898	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	899	}
				900	return head;
				901	/*
				902	* In case anything failed, we just free everything we got.
				903	*/
				904	no_grow:
				905	if (head) {
				906	do {
				907	bh = head;
				908	head = head->b_this_page;
				909	free_buffer_head(bh);
				910	} while (head);
				911	}
				912
				913	/*
				914	* Return failure for non-async IO requests. Async IO requests
				915	* are not allowed to fail, so we have to wait until buffer heads
				916	* become available. But we don't want tasks sleeping with
				917	* partially complete buffers, so all were released above.
				918	*/
				919	if (!retry)
				920	return NULL;
				921
				922	/* We're _really_ low on memory. Now we just
				923	* wait for old buffer heads to become free due to
				924	* finishing IO. Since this is an async request and
				925	* the reserve list is empty, we're sure there are
				926	* async buffer heads in use.
				927	*/
				928	free_more_memory();
				929	goto try_again;
				930	}
				931	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				932
				933	static inline void
				934	link_dev_buffers(struct page page, struct buffer_head head)
				935	{
				936	struct buffer_head bh, tail;
				937
				938	bh = head;
				939	do {
				940	tail = bh;
				941	bh = bh->b_this_page;
				942	} while (bh);
				943	tail->b_this_page = head;
				944	attach_page_buffers(page, head);
				945	}
				946
				947	/*
				948	* Initialise the state of a blockdev page's buffers.
				949	*/
				950	static void
				951	init_page_buffers(struct page page, struct block_device bdev,
				952	sector_t block, int size)
				953	{
				954	struct buffer_head *head = page_buffers(page);
				955	struct buffer_head *bh = head;
				956	int uptodate = PageUptodate(page);
				957
				958	do {
				959	if (!buffer_mapped(bh)) {
				960	init_buffer(bh, NULL, NULL);
				961	bh->b_bdev = bdev;
				962	bh->b_blocknr = block;
				963	if (uptodate)
				964	set_buffer_uptodate(bh);
				965	set_buffer_mapped(bh);
				966	}
				967	block++;
				968	bh = bh->b_this_page;
				969	} while (bh != head);
				970	}
				971
				972	/*
				973	* Create the page-cache page that contains the requested block.
				974	*
				975	* This is user purely for blockdev mappings.
				976	*/
				977	static struct page *
				978	grow_dev_page(struct block_device *bdev, sector_t block,
				979	pgoff_t index, int size)
				980	{
				981	struct inode *inode = bdev->bd_inode;
				982	struct page *page;
				983	struct buffer_head *bh;
				984
				985	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
				986	if (!page)
				987	return NULL;
				988
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	989	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	990
				991	if (page_has_buffers(page)) {
				992	bh = page_buffers(page);
				993	if (bh->b_size == size) {
				994	init_page_buffers(page, bdev, block, size);
				995	return page;
				996	}
				997	if (!try_to_free_buffers(page))
				998	goto failed;
				999	}
				1000
				1001	/*
				1002	* Allocate some buffers for this page
				1003	*/
				1004	bh = alloc_page_buffers(page, size, 0);
				1005	if (!bh)
				1006	goto failed;
				1007
				1008	/*
				1009	* Link the page to the buffers and initialise them. Take the
				1010	* lock to be atomic wrt __find_get_block(), which does not
				1011	* run under the page lock.
				1012	*/
				1013	spin_lock(&inode->i_mapping->private_lock);
				1014	link_dev_buffers(page, bh);
				1015	init_page_buffers(page, bdev, block, size);
				1016	spin_unlock(&inode->i_mapping->private_lock);
				1017	return page;
				1018
				1019	failed:
				1020	BUG();
				1021	unlock_page(page);
				1022	page_cache_release(page);
				1023	return NULL;
				1024	}
				1025
				1026	/*
				1027	* Create buffers for the specified block device block's page. If
				1028	* that page was dirty, the buffers are set dirty also.
				1029	*
				1030	* Except that's a bug. Attaching dirty buffers to a dirty
				1031	* blockdev's page can result in filesystem corruption, because
				1032	* some of those buffers may be aliases of filesystem data.
				1033	* grow_dev_page() will go BUG() if this happens.
				1034	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1035	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1036	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1037	{
				1038	struct page *page;
				1039	pgoff_t index;
				1040	int sizebits;
				1041
				1042	sizebits = -1;
				1043	do {
				1044	sizebits++;
				1045	} while ((size << sizebits) < PAGE_SIZE);
				1046
				1047	index = block >> sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1048
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1049	/*
				1050	* Check for a block which wants to lie outside our maximum possible
				1051	* pagecache index. (this comparison is done using sector_t types).
				1052	*/
				1053	if (unlikely(index != block >> sizebits)) {
				1054	char b[BDEVNAME_SIZE];
				1055
				1056	printk(KERN_ERR "%s: requested out-of-range block %llu for "
				1057	"device %s\n",
				1058	__FUNCTION__, (unsigned long long)block,
				1059	bdevname(bdev, b));
				1060	return -EIO;
				1061	}
				1062	block = index << sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1063	/* Create a page with the proper size buffers.. */
				1064	page = grow_dev_page(bdev, block, index, size);
				1065	if (!page)
				1066	return 0;
				1067	unlock_page(page);
				1068	page_cache_release(page);
				1069	return 1;
				1070	}
				1071
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1072	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1073	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1074	{
				1075	/* Size must be multiple of hard sectorsize */
				1076	if (unlikely(size & (bdev_hardsect_size(bdev)-1) \|\|
				1077	(size < 512 \|\| size > PAGE_SIZE))) {
				1078	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1079	size);
				1080	printk(KERN_ERR "hardsect size: %d\n",
				1081	bdev_hardsect_size(bdev));
				1082
				1083	dump_stack();
				1084	return NULL;
				1085	}
				1086
				1087	for (;;) {
				1088	struct buffer_head * bh;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1089	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1090
				1091	bh = __find_get_block(bdev, block, size);
				1092	if (bh)
				1093	return bh;
				1094
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1095	ret = grow_buffers(bdev, block, size);
				1096	if (ret < 0)
				1097	return NULL;
				1098	if (ret == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1099	free_more_memory();
				1100	}
				1101	}
				1102
				1103	/*
				1104	* The relationship between dirty buffers and dirty pages:
				1105	*
				1106	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1107	* the page is tagged dirty in its radix tree.
				1108	*
				1109	* At all times, the dirtiness of the buffers represents the dirtiness of
				1110	* subsections of the page. If the page has buffers, the page dirty bit is
				1111	* merely a hint about the true dirty state.
				1112	*
				1113	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1114	* (if the page has buffers).
				1115	*
				1116	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1117	* buffers are not.
				1118	*
				1119	* Also. When blockdev buffers are explicitly read with bread(), they
				1120	* individually become uptodate. But their backing page remains not
				1121	* uptodate - even if all of its buffers are uptodate. A subsequent
				1122	* block_read_full_page() against that page will discover all the uptodate
				1123	* buffers, will set the page uptodate and will perform no I/O.
				1124	*/
				1125
				1126	/**
				1127	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1128	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1129	*
				1130	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1131	* backing page dirty, then tag the page as dirty in its address_space's radix
				1132	* tree and then attach the address_space's inode to its superblock's dirty
				1133	* inode list.
				1134	*
				1135	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1136	* mapping->tree_lock and the global inode_lock.
				1137	*/
				1138	void fastcall mark_buffer_dirty(struct buffer_head *bh)
				1139	{
				1140	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
				1141	__set_page_dirty_nobuffers(bh->b_page);
				1142	}
				1143
				1144	/*
				1145	* Decrement a buffer_head's reference count. If all buffers against a page
				1146	* have zero reference count, are clean and unlocked, and if the page is clean
				1147	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1148	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1149	* a page but it ends up not being freed, and buffers may later be reattached).
				1150	*/
				1151	void __brelse(struct buffer_head * buf)
				1152	{
				1153	if (atomic_read(&buf->b_count)) {
				1154	put_bh(buf);
				1155	return;
				1156	}
				1157	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
				1158	WARN_ON(1);
				1159	}
				1160
				1161	/*
				1162	* bforget() is like brelse(), except it discards any
				1163	* potentially dirty data.
				1164	*/
				1165	void __bforget(struct buffer_head *bh)
				1166	{
				1167	clear_buffer_dirty(bh);
				1168	if (!list_empty(&bh->b_assoc_buffers)) {
				1169	struct address_space *buffer_mapping = bh->b_page->mapping;
				1170
				1171	spin_lock(&buffer_mapping->private_lock);
				1172	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	1173	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1174	spin_unlock(&buffer_mapping->private_lock);
				1175	}
				1176	__brelse(bh);
				1177	}
				1178
				1179	static struct buffer_head __bread_slow(struct buffer_head bh)
				1180	{
				1181	lock_buffer(bh);
				1182	if (buffer_uptodate(bh)) {
				1183	unlock_buffer(bh);
				1184	return bh;
				1185	} else {
				1186	get_bh(bh);
				1187	bh->b_end_io = end_buffer_read_sync;
				1188	submit_bh(READ, bh);
				1189	wait_on_buffer(bh);
				1190	if (buffer_uptodate(bh))
				1191	return bh;
				1192	}
				1193	brelse(bh);
				1194	return NULL;
				1195	}
				1196
				1197	/*
				1198	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1199	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1200	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1201	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1202	* CPU's LRUs at the same time.
				1203	*
				1204	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1205	* sb_find_get_block().
				1206	*
				1207	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1208	* a local interrupt disable for that.
				1209	*/
				1210
				1211	#define BH_LRU_SIZE 8
				1212
				1213	struct bh_lru {
				1214	struct buffer_head *bhs[BH_LRU_SIZE];
				1215	};
				1216
				1217	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1218
				1219	#ifdef CONFIG_SMP
				1220	#define bh_lru_lock() local_irq_disable()
				1221	#define bh_lru_unlock() local_irq_enable()
				1222	#else
				1223	#define bh_lru_lock() preempt_disable()
				1224	#define bh_lru_unlock() preempt_enable()
				1225	#endif
				1226
				1227	static inline void check_irqs_on(void)
				1228	{
				1229	#ifdef irqs_disabled
				1230	BUG_ON(irqs_disabled());
				1231	#endif
				1232	}
				1233
				1234	/*
				1235	* The LRU management algorithm is dopey-but-simple. Sorry.
				1236	*/
				1237	static void bh_lru_install(struct buffer_head *bh)
				1238	{
				1239	struct buffer_head *evictee = NULL;
				1240	struct bh_lru *lru;
				1241
				1242	check_irqs_on();
				1243	bh_lru_lock();
				1244	lru = &__get_cpu_var(bh_lrus);
				1245	if (lru->bhs[0] != bh) {
				1246	struct buffer_head *bhs[BH_LRU_SIZE];
				1247	int in;
				1248	int out = 0;
				1249
				1250	get_bh(bh);
				1251	bhs[out++] = bh;
				1252	for (in = 0; in < BH_LRU_SIZE; in++) {
				1253	struct buffer_head *bh2 = lru->bhs[in];
				1254
				1255	if (bh2 == bh) {
				1256	__brelse(bh2);
				1257	} else {
				1258	if (out >= BH_LRU_SIZE) {
				1259	BUG_ON(evictee != NULL);
				1260	evictee = bh2;
				1261	} else {
				1262	bhs[out++] = bh2;
				1263	}
				1264	}
				1265	}
				1266	while (out < BH_LRU_SIZE)
				1267	bhs[out++] = NULL;
				1268	memcpy(lru->bhs, bhs, sizeof(bhs));
				1269	}
				1270	bh_lru_unlock();
				1271
				1272	if (evictee)
				1273	__brelse(evictee);
				1274	}
				1275
				1276	/*
				1277	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1278	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1279	static struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1280	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1281	{
				1282	struct buffer_head *ret = NULL;
				1283	struct bh_lru *lru;
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1284	unsigned int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1285
				1286	check_irqs_on();
				1287	bh_lru_lock();
				1288	lru = &__get_cpu_var(bh_lrus);
				1289	for (i = 0; i < BH_LRU_SIZE; i++) {
				1290	struct buffer_head *bh = lru->bhs[i];
				1291
				1292	if (bh && bh->b_bdev == bdev &&
				1293	bh->b_blocknr == block && bh->b_size == size) {
				1294	if (i) {
				1295	while (i) {
				1296	lru->bhs[i] = lru->bhs[i - 1];
				1297	i--;
				1298	}
				1299	lru->bhs[0] = bh;
				1300	}
				1301	get_bh(bh);
				1302	ret = bh;
				1303	break;
				1304	}
				1305	}
				1306	bh_lru_unlock();
				1307	return ret;
				1308	}
				1309
				1310	/*
				1311	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1312	* it in the LRU and mark it as accessed. If it is not present then return
				1313	* NULL
				1314	*/
				1315	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1316	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1317	{
				1318	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1319
				1320	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1321	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1322	if (bh)
				1323	bh_lru_install(bh);
				1324	}
				1325	if (bh)
				1326	touch_buffer(bh);
				1327	return bh;
				1328	}
				1329	EXPORT_SYMBOL(__find_get_block);
				1330
				1331	/*
				1332	* __getblk will locate (and, if necessary, create) the buffer_head
				1333	* which corresponds to the passed block_device, block and size. The
				1334	* returned buffer has its reference count incremented.
				1335	*
				1336	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1337	* illegal block number, __getblk() will happily return a buffer_head
				1338	* which represents the non-existent block. Very weird.
				1339	*
				1340	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1341	* attempt is failing. FIXME, perhaps?
				1342	*/
				1343	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1344	__getblk(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1345	{
				1346	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1347
				1348	might_sleep();
				1349	if (bh == NULL)
				1350	bh = __getblk_slow(bdev, block, size);
				1351	return bh;
				1352	}
				1353	EXPORT_SYMBOL(__getblk);
				1354
				1355	/*
				1356	* Do async read-ahead on a buffer..
				1357	*/
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1358	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1359	{
				1360	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1361	if (likely(bh)) {
				1362	ll_rw_block(READA, 1, &bh);
				1363	brelse(bh);
				1364	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1365	}
				1366	EXPORT_SYMBOL(__breadahead);
				1367
				1368	/**
				1369	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1370	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1371	* @block: number of block
				1372	* @size: size (in bytes) to read
				1373	*
				1374	* Reads a specified block, and returns buffer head that contains it.
				1375	* It returns NULL if the block was unreadable.
				1376	*/
				1377	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1378	__bread(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1379	{
				1380	struct buffer_head *bh = __getblk(bdev, block, size);
				1381
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1382	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1383	bh = __bread_slow(bh);
				1384	return bh;
				1385	}
				1386	EXPORT_SYMBOL(__bread);
				1387
				1388	/*
				1389	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1390	* This doesn't race because it runs in each cpu either in irq
				1391	* or with preempt disabled.
				1392	*/
				1393	static void invalidate_bh_lru(void *arg)
				1394	{
				1395	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1396	int i;
				1397
				1398	for (i = 0; i < BH_LRU_SIZE; i++) {
				1399	brelse(b->bhs[i]);
				1400	b->bhs[i] = NULL;
				1401	}
				1402	put_cpu_var(bh_lrus);
				1403	}
				1404
Peter Zijlstra	f9a1439	2007-05-06 14:49:55 -0700	[diff] [blame]	1405	void invalidate_bh_lrus(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1406	{
				1407	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
				1408	}
				1409
				1410	void set_bh_page(struct buffer_head *bh,
				1411	struct page *page, unsigned long offset)
				1412	{
				1413	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1414	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1415	if (PageHighMem(page))
				1416	/*
				1417	* This catches illegal uses and preserves the offset:
				1418	*/
				1419	bh->b_data = (char *)(0 + offset);
				1420	else
				1421	bh->b_data = page_address(page) + offset;
				1422	}
				1423	EXPORT_SYMBOL(set_bh_page);
				1424
				1425	/*
				1426	* Called when truncating a buffer on a page completely.
				1427	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1428	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1429	{
				1430	lock_buffer(bh);
				1431	clear_buffer_dirty(bh);
				1432	bh->b_bdev = NULL;
				1433	clear_buffer_mapped(bh);
				1434	clear_buffer_req(bh);
				1435	clear_buffer_new(bh);
				1436	clear_buffer_delay(bh);
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1437	clear_buffer_unwritten(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1438	unlock_buffer(bh);
				1439	}
				1440
				1441	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1442	* block_invalidatepage - invalidate part of all of a buffer-backed page
				1443	*
				1444	* @page: the page which is affected
				1445	* @offset: the index of the truncation point
				1446	*
				1447	* block_invalidatepage() is called when all or part of the page has become
				1448	* invalidatedby a truncate operation.
				1449	*
				1450	* block_invalidatepage() does not have to release all buffers, but it must
				1451	* ensure that no dirty buffer is left outside @offset and that no I/O
				1452	* is underway against any of the blocks which are outside the truncation
				1453	* point. Because the caller is about to free (and possibly reuse) those
				1454	* blocks on-disk.
				1455	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1456	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1457	{
				1458	struct buffer_head head, bh, *next;
				1459	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1460
				1461	BUG_ON(!PageLocked(page));
				1462	if (!page_has_buffers(page))
				1463	goto out;
				1464
				1465	head = page_buffers(page);
				1466	bh = head;
				1467	do {
				1468	unsigned int next_off = curr_off + bh->b_size;
				1469	next = bh->b_this_page;
				1470
				1471	/*
				1472	* is this block fully invalidated?
				1473	*/
				1474	if (offset <= curr_off)
				1475	discard_buffer(bh);
				1476	curr_off = next_off;
				1477	bh = next;
				1478	} while (bh != head);
				1479
				1480	/*
				1481	* We release buffers only if the entire page is being invalidated.
				1482	* The get_block cached value has been unconditionally invalidated,
				1483	* so real IO is not possible anymore.
				1484	*/
				1485	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1486	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1487	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1488	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1489	}
				1490	EXPORT_SYMBOL(block_invalidatepage);
				1491
				1492	/*
				1493	* We attach and possibly dirty the buffers atomically wrt
				1494	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1495	* is already excluded via the page lock.
				1496	*/
				1497	void create_empty_buffers(struct page *page,
				1498	unsigned long blocksize, unsigned long b_state)
				1499	{
				1500	struct buffer_head bh, head, *tail;
				1501
				1502	head = alloc_page_buffers(page, blocksize, 1);
				1503	bh = head;
				1504	do {
				1505	bh->b_state \|= b_state;
				1506	tail = bh;
				1507	bh = bh->b_this_page;
				1508	} while (bh);
				1509	tail->b_this_page = head;
				1510
				1511	spin_lock(&page->mapping->private_lock);
				1512	if (PageUptodate(page) \|\| PageDirty(page)) {
				1513	bh = head;
				1514	do {
				1515	if (PageDirty(page))
				1516	set_buffer_dirty(bh);
				1517	if (PageUptodate(page))
				1518	set_buffer_uptodate(bh);
				1519	bh = bh->b_this_page;
				1520	} while (bh != head);
				1521	}
				1522	attach_page_buffers(page, head);
				1523	spin_unlock(&page->mapping->private_lock);
				1524	}
				1525	EXPORT_SYMBOL(create_empty_buffers);
				1526
				1527	/*
				1528	* We are taking a block for data and we don't want any output from any
				1529	* buffer-cache aliases starting from return from that function and
				1530	* until the moment when something will explicitly mark the buffer
				1531	* dirty (hopefully that will not happen until we will free that block ;-)
				1532	* We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to use
				1534	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1535	* don't want to mark the alias unmapped, for example - it would confuse
				1536	* anyone who might pick it with bread() afterwards...
				1537	*
				1538	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1539	* be writeout I/O going on against recently-freed buffers. We don't
				1540	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1541	* only if we really need to. That happens here.
				1542	*/
				1543	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1544	{
				1545	struct buffer_head *old_bh;
				1546
				1547	might_sleep();
				1548
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1549	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1550	if (old_bh) {
				1551	clear_buffer_dirty(old_bh);
				1552	wait_on_buffer(old_bh);
				1553	clear_buffer_req(old_bh);
				1554	__brelse(old_bh);
				1555	}
				1556	}
				1557	EXPORT_SYMBOL(unmap_underlying_metadata);
				1558
				1559	/*
				1560	* NOTE! All mapped/uptodate combinations are valid:
				1561	*
				1562	* Mapped Uptodate Meaning
				1563	*
				1564	* No No "unknown" - must do get_block()
				1565	* No Yes "hole" - zero-filled
				1566	* Yes No "allocated" - allocated on disk, not read in
				1567	* Yes Yes "valid" - allocated and up-to-date in memory.
				1568	*
				1569	* "Dirty" is valid only with the last case (mapped+uptodate).
				1570	*/
				1571
				1572	/*
				1573	* While block_write_full_page is writing back the dirty buffers under
				1574	* the page lock, whoever dirtied the buffers may decide to clean them
				1575	* again at any time. We handle that by only looking at the buffer
				1576	* state inside lock_buffer().
				1577	*
				1578	* If block_write_full_page() is called for regular writeback
				1579	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1580	* locked buffer. This only can happen if someone has written the buffer
				1581	* directly, with submit_bh(). At the address_space level PageWriteback
				1582	* prevents this contention from occurring.
				1583	*/
				1584	static int __block_write_full_page(struct inode inode, struct page page,
				1585	get_block_t get_block, struct writeback_control wbc)
				1586	{
				1587	int err;
				1588	sector_t block;
				1589	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1590	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1591	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1592	int nr_underway = 0;
				1593
				1594	BUG_ON(!PageLocked(page));
				1595
				1596	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1597
				1598	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1599	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1600	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1601	}
				1602
				1603	/*
				1604	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1605	* here, and the (potentially unmapped) buffers may become dirty at
				1606	* any time. If a buffer becomes dirty here after we've inspected it
				1607	* then we just miss that fact, and the page stays dirty.
				1608	*
				1609	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1610	* handle that here by just cleaning them.
				1611	*/
				1612
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1613	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1614	head = page_buffers(page);
				1615	bh = head;
				1616
				1617	/*
				1618	* Get all the dirty buffers mapped to disk addresses and
				1619	* handle any aliases from the underlying blockdev's mapping.
				1620	*/
				1621	do {
				1622	if (block > last_block) {
				1623	/*
				1624	* mapped buffers outside i_size will occur, because
				1625	* this page can be outside i_size when there is a
				1626	* truncate in progress.
				1627	*/
				1628	/*
				1629	* The buffer was zeroed by block_write_full_page()
				1630	*/
				1631	clear_buffer_dirty(bh);
				1632	set_buffer_uptodate(bh);
				1633	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1634	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1635	err = get_block(inode, block, bh, 1);
				1636	if (err)
				1637	goto recover;
				1638	if (buffer_new(bh)) {
				1639	/* blockdev mappings never come here */
				1640	clear_buffer_new(bh);
				1641	unmap_underlying_metadata(bh->b_bdev,
				1642	bh->b_blocknr);
				1643	}
				1644	}
				1645	bh = bh->b_this_page;
				1646	block++;
				1647	} while (bh != head);
				1648
				1649	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1650	if (!buffer_mapped(bh))
				1651	continue;
				1652	/*
				1653	* If it's a fully non-blocking write attempt and we cannot
				1654	* lock the buffer then redirty the page. Note that this can
				1655	* potentially cause a busy-wait loop from pdflush and kswapd
				1656	* activity, but those code paths have their own higher-level
				1657	* throttling.
				1658	*/
				1659	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1660	lock_buffer(bh);
				1661	} else if (test_set_buffer_locked(bh)) {
				1662	redirty_page_for_writepage(wbc, page);
				1663	continue;
				1664	}
				1665	if (test_clear_buffer_dirty(bh)) {
				1666	mark_buffer_async_write(bh);
				1667	} else {
				1668	unlock_buffer(bh);
				1669	}
				1670	} while ((bh = bh->b_this_page) != head);
				1671
				1672	/*
				1673	* The page and its buffers are protected by PageWriteback(), so we can
				1674	* drop the bh refcounts early.
				1675	*/
				1676	BUG_ON(PageWriteback(page));
				1677	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1678
				1679	do {
				1680	struct buffer_head *next = bh->b_this_page;
				1681	if (buffer_async_write(bh)) {
				1682	submit_bh(WRITE, bh);
				1683	nr_underway++;
				1684	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1685	bh = next;
				1686	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1687	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1688
				1689	err = 0;
				1690	done:
				1691	if (nr_underway == 0) {
				1692	/*
				1693	* The page was marked dirty, but the buffers were
				1694	* clean. Someone wrote them back by hand with
				1695	* ll_rw_block/submit_bh. A rare case.
				1696	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1697	end_page_writeback(page);
Nick Piggin	3d67f2d	2007-05-06 14:49:05 -0700	[diff] [blame]	1698
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1699	/*
				1700	* The page and buffer_heads can be released at any time from
				1701	* here on.
				1702	*/
				1703	wbc->pages_skipped++; /* We didn't write this page */
				1704	}
				1705	return err;
				1706
				1707	recover:
				1708	/*
				1709	* ENOSPC, or some other error. We may already have added some
				1710	* blocks to the file, so we need to write these out to avoid
				1711	* exposing stale data.
				1712	* The page is currently locked and not marked for writeback
				1713	*/
				1714	bh = head;
				1715	/* Recovery: lock and submit the mapped buffers */
				1716	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1717	if (buffer_mapped(bh) && buffer_dirty(bh)) {
				1718	lock_buffer(bh);
				1719	mark_buffer_async_write(bh);
				1720	} else {
				1721	/*
				1722	* The buffer may have been set dirty during
				1723	* attachment to a dirty page.
				1724	*/
				1725	clear_buffer_dirty(bh);
				1726	}
				1727	} while ((bh = bh->b_this_page) != head);
				1728	SetPageError(page);
				1729	BUG_ON(PageWriteback(page));
Andrew Morton	7e4c369	2007-05-08 00:23:27 -0700	[diff] [blame^]	1730	mapping_set_error(page->mapping, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1731	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1732	do {
				1733	struct buffer_head *next = bh->b_this_page;
				1734	if (buffer_async_write(bh)) {
				1735	clear_buffer_dirty(bh);
				1736	submit_bh(WRITE, bh);
				1737	nr_underway++;
				1738	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1739	bh = next;
				1740	} while (bh != head);
Nick Piggin	ffda9d3	2007-02-20 13:57:54 -0800	[diff] [blame]	1741	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1742	goto done;
				1743	}
				1744
				1745	static int __block_prepare_write(struct inode inode, struct page page,
				1746	unsigned from, unsigned to, get_block_t *get_block)
				1747	{
				1748	unsigned block_start, block_end;
				1749	sector_t block;
				1750	int err = 0;
				1751	unsigned blocksize, bbits;
				1752	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1753
				1754	BUG_ON(!PageLocked(page));
				1755	BUG_ON(from > PAGE_CACHE_SIZE);
				1756	BUG_ON(to > PAGE_CACHE_SIZE);
				1757	BUG_ON(from > to);
				1758
				1759	blocksize = 1 << inode->i_blkbits;
				1760	if (!page_has_buffers(page))
				1761	create_empty_buffers(page, blocksize, 0);
				1762	head = page_buffers(page);
				1763
				1764	bbits = inode->i_blkbits;
				1765	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1766
				1767	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1768	block++, block_start=block_end, bh = bh->b_this_page) {
				1769	block_end = block_start + blocksize;
				1770	if (block_end <= from \|\| block_start >= to) {
				1771	if (PageUptodate(page)) {
				1772	if (!buffer_uptodate(bh))
				1773	set_buffer_uptodate(bh);
				1774	}
				1775	continue;
				1776	}
				1777	if (buffer_new(bh))
				1778	clear_buffer_new(bh);
				1779	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1780	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1781	err = get_block(inode, block, bh, 1);
				1782	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1783	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1784	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1785	unmap_underlying_metadata(bh->b_bdev,
				1786	bh->b_blocknr);
				1787	if (PageUptodate(page)) {
				1788	set_buffer_uptodate(bh);
				1789	continue;
				1790	}
				1791	if (block_end > to \|\| block_start < from) {
				1792	void *kaddr;
				1793
				1794	kaddr = kmap_atomic(page, KM_USER0);
				1795	if (block_end > to)
				1796	memset(kaddr+to, 0,
				1797	block_end-to);
				1798	if (block_start < from)
				1799	memset(kaddr+block_start,
				1800	0, from-block_start);
				1801	flush_dcache_page(page);
				1802	kunmap_atomic(kaddr, KM_USER0);
				1803	}
				1804	continue;
				1805	}
				1806	}
				1807	if (PageUptodate(page)) {
				1808	if (!buffer_uptodate(bh))
				1809	set_buffer_uptodate(bh);
				1810	continue;
				1811	}
				1812	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1813	!buffer_unwritten(bh) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1814	(block_start < from \|\| block_end > to)) {
				1815	ll_rw_block(READ, 1, &bh);
				1816	*wait_bh++=bh;
				1817	}
				1818	}
				1819	/*
				1820	* If we issued read requests - let them complete.
				1821	*/
				1822	while(wait_bh > wait) {
				1823	wait_on_buffer(*--wait_bh);
				1824	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1825	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1826	}
Anton Altaparmakov	152becd	2005-06-23 00:10:21 -0700	[diff] [blame]	1827	if (!err) {
				1828	bh = head;
				1829	do {
				1830	if (buffer_new(bh))
				1831	clear_buffer_new(bh);
				1832	} while ((bh = bh->b_this_page) != head);
				1833	return 0;
				1834	}
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1835	/* Error case: */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1836	/*
				1837	* Zero out any newly allocated blocks to avoid exposing stale
				1838	* data. If BH_New is set, we know that the block was newly
				1839	* allocated in the above loop.
				1840	*/
				1841	bh = head;
				1842	block_start = 0;
				1843	do {
				1844	block_end = block_start+blocksize;
				1845	if (block_end <= from)
				1846	goto next_bh;
				1847	if (block_start >= to)
				1848	break;
				1849	if (buffer_new(bh)) {
				1850	void *kaddr;
				1851
				1852	clear_buffer_new(bh);
				1853	kaddr = kmap_atomic(page, KM_USER0);
				1854	memset(kaddr+block_start, 0, bh->b_size);
Monakhov Dmitriy	8c58165	2006-10-11 01:22:00 -0700	[diff] [blame]	1855	flush_dcache_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1856	kunmap_atomic(kaddr, KM_USER0);
				1857	set_buffer_uptodate(bh);
				1858	mark_buffer_dirty(bh);
				1859	}
				1860	next_bh:
				1861	block_start = block_end;
				1862	bh = bh->b_this_page;
				1863	} while (bh != head);
				1864	return err;
				1865	}
				1866
				1867	static int __block_commit_write(struct inode inode, struct page page,
				1868	unsigned from, unsigned to)
				1869	{
				1870	unsigned block_start, block_end;
				1871	int partial = 0;
				1872	unsigned blocksize;
				1873	struct buffer_head bh, head;
				1874
				1875	blocksize = 1 << inode->i_blkbits;
				1876
				1877	for(bh = head = page_buffers(page), block_start = 0;
				1878	bh != head \|\| !block_start;
				1879	block_start=block_end, bh = bh->b_this_page) {
				1880	block_end = block_start + blocksize;
				1881	if (block_end <= from \|\| block_start >= to) {
				1882	if (!buffer_uptodate(bh))
				1883	partial = 1;
				1884	} else {
				1885	set_buffer_uptodate(bh);
				1886	mark_buffer_dirty(bh);
				1887	}
				1888	}
				1889
				1890	/*
				1891	* If this is a partial write which happened to make all buffers
				1892	* uptodate then we can optimize away a bogus readpage() for
				1893	* the next read(). Here we 'discover' whether the page went
				1894	* uptodate as a result of this (potentially partial) write.
				1895	*/
				1896	if (!partial)
				1897	SetPageUptodate(page);
				1898	return 0;
				1899	}
				1900
				1901	/*
				1902	* Generic "read page" function for block devices that have the normal
				1903	* get_block functionality. This is most of the block device filesystems.
				1904	* Reads the page asynchronously --- the unlock_buffer() and
				1905	* set/clear_buffer_uptodate() functions propagate buffer state into the
				1906	* page struct once IO has completed.
				1907	*/
				1908	int block_read_full_page(struct page page, get_block_t get_block)
				1909	{
				1910	struct inode *inode = page->mapping->host;
				1911	sector_t iblock, lblock;
				1912	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				1913	unsigned int blocksize;
				1914	int nr, i;
				1915	int fully_mapped = 1;
				1916
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	1917	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1918	blocksize = 1 << inode->i_blkbits;
				1919	if (!page_has_buffers(page))
				1920	create_empty_buffers(page, blocksize, 0);
				1921	head = page_buffers(page);
				1922
				1923	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				1924	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				1925	bh = head;
				1926	nr = 0;
				1927	i = 0;
				1928
				1929	do {
				1930	if (buffer_uptodate(bh))
				1931	continue;
				1932
				1933	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1934	int err = 0;
				1935
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1936	fully_mapped = 0;
				1937	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1938	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1939	err = get_block(inode, iblock, bh, 0);
				1940	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1941	SetPageError(page);
				1942	}
				1943	if (!buffer_mapped(bh)) {
				1944	void *kaddr = kmap_atomic(page, KM_USER0);
				1945	memset(kaddr + i * blocksize, 0, blocksize);
				1946	flush_dcache_page(page);
				1947	kunmap_atomic(kaddr, KM_USER0);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1948	if (!err)
				1949	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1950	continue;
				1951	}
				1952	/*
				1953	* get_block() might have updated the buffer
				1954	* synchronously
				1955	*/
				1956	if (buffer_uptodate(bh))
				1957	continue;
				1958	}
				1959	arr[nr++] = bh;
				1960	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				1961
				1962	if (fully_mapped)
				1963	SetPageMappedToDisk(page);
				1964
				1965	if (!nr) {
				1966	/*
				1967	* All buffers are uptodate - we can set the page uptodate
				1968	* as well. But not if get_block() returned an error.
				1969	*/
				1970	if (!PageError(page))
				1971	SetPageUptodate(page);
				1972	unlock_page(page);
				1973	return 0;
				1974	}
				1975
				1976	/* Stage two: lock the buffers */
				1977	for (i = 0; i < nr; i++) {
				1978	bh = arr[i];
				1979	lock_buffer(bh);
				1980	mark_buffer_async_read(bh);
				1981	}
				1982
				1983	/*
				1984	* Stage 3: start the IO. Check for uptodateness
				1985	* inside the buffer lock in case another process reading
				1986	* the underlying blockdev brought it uptodate (the sct fix).
				1987	*/
				1988	for (i = 0; i < nr; i++) {
				1989	bh = arr[i];
				1990	if (buffer_uptodate(bh))
				1991	end_buffer_async_read(bh, 1);
				1992	else
				1993	submit_bh(READ, bh);
				1994	}
				1995	return 0;
				1996	}
				1997
				1998	/* utility function for filesystems that need to do work on expanding
				1999	* truncates. Uses prepare/commit_write to allow the filesystem to
				2000	* deal with the hole.
				2001	*/
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2002	static int __generic_cont_expand(struct inode *inode, loff_t size,
				2003	pgoff_t index, unsigned int offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2004	{
				2005	struct address_space *mapping = inode->i_mapping;
				2006	struct page *page;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2007	unsigned long limit;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2008	int err;
				2009
				2010	err = -EFBIG;
				2011	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2012	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2013	send_sig(SIGXFSZ, current, 0);
				2014	goto out;
				2015	}
				2016	if (size > inode->i_sb->s_maxbytes)
				2017	goto out;
				2018
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2019	err = -ENOMEM;
				2020	page = grab_cache_page(mapping, index);
				2021	if (!page)
				2022	goto out;
				2023	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2024	if (err) {
				2025	/*
				2026	* ->prepare_write() may have instantiated a few blocks
				2027	* outside i_size. Trim these off again.
				2028	*/
				2029	unlock_page(page);
				2030	page_cache_release(page);
				2031	vmtruncate(inode, inode->i_size);
				2032	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2033	}
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2034
				2035	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
				2036
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2037	unlock_page(page);
				2038	page_cache_release(page);
				2039	if (err > 0)
				2040	err = 0;
				2041	out:
				2042	return err;
				2043	}
				2044
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2045	int generic_cont_expand(struct inode *inode, loff_t size)
				2046	{
				2047	pgoff_t index;
				2048	unsigned int offset;
				2049
				2050	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
				2051
				2052	/* ugh. in prepare/commit_write, if from==to==start of block, we
				2053	** skip the prepare. make sure we never send an offset for the start
				2054	** of a block
				2055	*/
				2056	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				2057	/* caller must handle this extra byte. */
				2058	offset++;
				2059	}
				2060	index = size >> PAGE_CACHE_SHIFT;
				2061
				2062	return __generic_cont_expand(inode, size, index, offset);
				2063	}
				2064
				2065	int generic_cont_expand_simple(struct inode *inode, loff_t size)
				2066	{
				2067	loff_t pos = size - 1;
				2068	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
				2069	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
				2070
				2071	/* prepare/commit_write can handle even if from==to==start of block. */
				2072	return __generic_cont_expand(inode, size, index, offset);
				2073	}
				2074
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2075	/*
				2076	* For moronic filesystems that do not allow holes in file.
				2077	* We may have to extend the file.
				2078	*/
				2079
				2080	int cont_prepare_write(struct page *page, unsigned offset,
				2081	unsigned to, get_block_t get_block, loff_t bytes)
				2082	{
				2083	struct address_space *mapping = page->mapping;
				2084	struct inode *inode = mapping->host;
				2085	struct page *new_page;
				2086	pgoff_t pgpos;
				2087	long status;
				2088	unsigned zerofrom;
				2089	unsigned blocksize = 1 << inode->i_blkbits;
				2090	void *kaddr;
				2091
				2092	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
				2093	status = -ENOMEM;
				2094	new_page = grab_cache_page(mapping, pgpos);
				2095	if (!new_page)
				2096	goto out;
				2097	/* we might sleep */
				2098	if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
				2099	unlock_page(new_page);
				2100	page_cache_release(new_page);
				2101	continue;
				2102	}
				2103	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2104	if (zerofrom & (blocksize-1)) {
				2105	*bytes \|= (blocksize-1);
				2106	(*bytes)++;
				2107	}
				2108	status = __block_prepare_write(inode, new_page, zerofrom,
				2109	PAGE_CACHE_SIZE, get_block);
				2110	if (status)
				2111	goto out_unmap;
				2112	kaddr = kmap_atomic(new_page, KM_USER0);
				2113	memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
				2114	flush_dcache_page(new_page);
				2115	kunmap_atomic(kaddr, KM_USER0);
				2116	generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
				2117	unlock_page(new_page);
				2118	page_cache_release(new_page);
				2119	}
				2120
				2121	if (page->index < pgpos) {
				2122	/* completely inside the area */
				2123	zerofrom = offset;
				2124	} else {
				2125	/* page covers the boundary, find the boundary offset */
				2126	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2127
				2128	/* if we will expand the thing last block will be filled */
				2129	if (to > zerofrom && (zerofrom & (blocksize-1))) {
				2130	*bytes \|= (blocksize-1);
				2131	(*bytes)++;
				2132	}
				2133
				2134	/* starting below the boundary? Nothing to zero out */
				2135	if (offset <= zerofrom)
				2136	zerofrom = offset;
				2137	}
				2138	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
				2139	if (status)
				2140	goto out1;
				2141	if (zerofrom < offset) {
				2142	kaddr = kmap_atomic(page, KM_USER0);
				2143	memset(kaddr+zerofrom, 0, offset-zerofrom);
				2144	flush_dcache_page(page);
				2145	kunmap_atomic(kaddr, KM_USER0);
				2146	__block_commit_write(inode, page, zerofrom, offset);
				2147	}
				2148	return 0;
				2149	out1:
				2150	ClearPageUptodate(page);
				2151	return status;
				2152
				2153	out_unmap:
				2154	ClearPageUptodate(new_page);
				2155	unlock_page(new_page);
				2156	page_cache_release(new_page);
				2157	out:
				2158	return status;
				2159	}
				2160
				2161	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2162	get_block_t *get_block)
				2163	{
				2164	struct inode *inode = page->mapping->host;
				2165	int err = __block_prepare_write(inode, page, from, to, get_block);
				2166	if (err)
				2167	ClearPageUptodate(page);
				2168	return err;
				2169	}
				2170
				2171	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2172	{
				2173	struct inode *inode = page->mapping->host;
				2174	__block_commit_write(inode,page,from,to);
				2175	return 0;
				2176	}
				2177
				2178	int generic_commit_write(struct file file, struct page page,
				2179	unsigned from, unsigned to)
				2180	{
				2181	struct inode *inode = page->mapping->host;
				2182	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2183	__block_commit_write(inode,page,from,to);
				2184	/*
				2185	* No need to use i_size_read() here, the i_size
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	2186	* cannot change under us because we hold i_mutex.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2187	*/
				2188	if (pos > inode->i_size) {
				2189	i_size_write(inode, pos);
				2190	mark_inode_dirty(inode);
				2191	}
				2192	return 0;
				2193	}
				2194
				2195
				2196	/*
				2197	* nobh_prepare_write()'s prereads are special: the buffer_heads are freed
				2198	* immediately, while under the page lock. So it needs a special end_io
				2199	* handler which does not touch the bh after unlocking it.
				2200	*
				2201	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				2202	* a race there is benign: unlock_buffer() only use the bh's address for
				2203	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				2204	* itself.
				2205	*/
				2206	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2207	{
				2208	if (uptodate) {
				2209	set_buffer_uptodate(bh);
				2210	} else {
				2211	/* This happens, due to failed READA attempts. */
				2212	clear_buffer_uptodate(bh);
				2213	}
				2214	unlock_buffer(bh);
				2215	}
				2216
				2217	/*
				2218	* On entry, the page is fully not uptodate.
				2219	* On exit the page is fully uptodate in the areas outside (from,to)
				2220	*/
				2221	int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
				2222	get_block_t *get_block)
				2223	{
				2224	struct inode *inode = page->mapping->host;
				2225	const unsigned blkbits = inode->i_blkbits;
				2226	const unsigned blocksize = 1 << blkbits;
				2227	struct buffer_head map_bh;
				2228	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
				2229	unsigned block_in_page;
				2230	unsigned block_start;
				2231	sector_t block_in_file;
				2232	char *kaddr;
				2233	int nr_reads = 0;
				2234	int i;
				2235	int ret = 0;
				2236	int is_mapped_to_disk = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2237
				2238	if (PageMappedToDisk(page))
				2239	return 0;
				2240
				2241	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
				2242	map_bh.b_page = page;
				2243
				2244	/*
				2245	* We loop across all blocks in the page, whether or not they are
				2246	* part of the affected region. This is so we can discover if the
				2247	* page is fully mapped-to-disk.
				2248	*/
				2249	for (block_start = 0, block_in_page = 0;
				2250	block_start < PAGE_CACHE_SIZE;
				2251	block_in_page++, block_start += blocksize) {
				2252	unsigned block_end = block_start + blocksize;
				2253	int create;
				2254
				2255	map_bh.b_state = 0;
				2256	create = 1;
				2257	if (block_start >= to)
				2258	create = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2259	map_bh.b_size = blocksize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2260	ret = get_block(inode, block_in_file + block_in_page,
				2261	&map_bh, create);
				2262	if (ret)
				2263	goto failed;
				2264	if (!buffer_mapped(&map_bh))
				2265	is_mapped_to_disk = 0;
				2266	if (buffer_new(&map_bh))
				2267	unmap_underlying_metadata(map_bh.b_bdev,
				2268	map_bh.b_blocknr);
				2269	if (PageUptodate(page))
				2270	continue;
				2271	if (buffer_new(&map_bh) \|\| !buffer_mapped(&map_bh)) {
				2272	kaddr = kmap_atomic(page, KM_USER0);
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2273	if (block_start < from)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2274	memset(kaddr+block_start, 0, from-block_start);
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2275	if (block_end > to)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2276	memset(kaddr + to, 0, block_end - to);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2277	flush_dcache_page(page);
				2278	kunmap_atomic(kaddr, KM_USER0);
				2279	continue;
				2280	}
				2281	if (buffer_uptodate(&map_bh))
				2282	continue; /* reiserfs does this */
				2283	if (block_start < from \|\| block_end > to) {
				2284	struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
				2285
				2286	if (!bh) {
				2287	ret = -ENOMEM;
				2288	goto failed;
				2289	}
				2290	bh->b_state = map_bh.b_state;
				2291	atomic_set(&bh->b_count, 0);
				2292	bh->b_this_page = NULL;
				2293	bh->b_page = page;
				2294	bh->b_blocknr = map_bh.b_blocknr;
				2295	bh->b_size = blocksize;
				2296	bh->b_data = (char *)(long)block_start;
				2297	bh->b_bdev = map_bh.b_bdev;
				2298	bh->b_private = NULL;
				2299	read_bh[nr_reads++] = bh;
				2300	}
				2301	}
				2302
				2303	if (nr_reads) {
				2304	struct buffer_head *bh;
				2305
				2306	/*
				2307	* The page is locked, so these buffers are protected from
				2308	* any VM or truncate activity. Hence we don't need to care
				2309	* for the buffer_head refcounts.
				2310	*/
				2311	for (i = 0; i < nr_reads; i++) {
				2312	bh = read_bh[i];
				2313	lock_buffer(bh);
				2314	bh->b_end_io = end_buffer_read_nobh;
				2315	submit_bh(READ, bh);
				2316	}
				2317	for (i = 0; i < nr_reads; i++) {
				2318	bh = read_bh[i];
				2319	wait_on_buffer(bh);
				2320	if (!buffer_uptodate(bh))
				2321	ret = -EIO;
				2322	free_buffer_head(bh);
				2323	read_bh[i] = NULL;
				2324	}
				2325	if (ret)
				2326	goto failed;
				2327	}
				2328
				2329	if (is_mapped_to_disk)
				2330	SetPageMappedToDisk(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2331
				2332	return 0;
				2333
				2334	failed:
				2335	for (i = 0; i < nr_reads; i++) {
				2336	if (read_bh[i])
				2337	free_buffer_head(read_bh[i]);
				2338	}
				2339
				2340	/*
				2341	* Error recovery is pretty slack. Clear the page and mark it dirty
				2342	* so we'll later zero out any blocks which _were_ allocated.
				2343	*/
				2344	kaddr = kmap_atomic(page, KM_USER0);
				2345	memset(kaddr, 0, PAGE_CACHE_SIZE);
Monakhov Dmitriy	8c58165	2006-10-11 01:22:00 -0700	[diff] [blame]	2346	flush_dcache_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2347	kunmap_atomic(kaddr, KM_USER0);
				2348	SetPageUptodate(page);
				2349	set_page_dirty(page);
				2350	return ret;
				2351	}
				2352	EXPORT_SYMBOL(nobh_prepare_write);
				2353
Dave Kleikamp	57bf63d	2007-03-06 01:42:12 -0800	[diff] [blame]	2354	/*
				2355	* Make sure any changes to nobh_commit_write() are reflected in
				2356	* nobh_truncate_page(), since it doesn't call commit_write().
				2357	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2358	int nobh_commit_write(struct file file, struct page page,
				2359	unsigned from, unsigned to)
				2360	{
				2361	struct inode *inode = page->mapping->host;
				2362	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2363
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2364	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2365	set_page_dirty(page);
				2366	if (pos > inode->i_size) {
				2367	i_size_write(inode, pos);
				2368	mark_inode_dirty(inode);
				2369	}
				2370	return 0;
				2371	}
				2372	EXPORT_SYMBOL(nobh_commit_write);
				2373
				2374	/*
				2375	* nobh_writepage() - based on block_full_write_page() except
				2376	* that it tries to operate without attaching bufferheads to
				2377	* the page.
				2378	*/
				2379	int nobh_writepage(struct page page, get_block_t get_block,
				2380	struct writeback_control *wbc)
				2381	{
				2382	struct inode * const inode = page->mapping->host;
				2383	loff_t i_size = i_size_read(inode);
				2384	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2385	unsigned offset;
				2386	void *kaddr;
				2387	int ret;
				2388
				2389	/* Is the page fully inside i_size? */
				2390	if (page->index < end_index)
				2391	goto out;
				2392
				2393	/* Is the page fully outside i_size? (truncate in progress) */
				2394	offset = i_size & (PAGE_CACHE_SIZE-1);
				2395	if (page->index >= end_index+1 \|\| !offset) {
				2396	/*
				2397	* The page may have dirty, unmapped buffers. For example,
				2398	* they may have been added in ext3_writepage(). Make them
				2399	* freeable here, so the page does not leak.
				2400	*/
				2401	#if 0
				2402	/* Not really sure about this - do we need this ? */
				2403	if (page->mapping->a_ops->invalidatepage)
				2404	page->mapping->a_ops->invalidatepage(page, offset);
				2405	#endif
				2406	unlock_page(page);
				2407	return 0; /* don't care */
				2408	}
				2409
				2410	/*
				2411	* The page straddles i_size. It must be zeroed out on each and every
				2412	* writepage invocation because it may be mmapped. "A file is mapped
				2413	* in multiples of the page size. For a file that is not a multiple of
				2414	* the page size, the remaining memory is zeroed when mapped, and
				2415	* writes to that region are not written out to the file."
				2416	*/
				2417	kaddr = kmap_atomic(page, KM_USER0);
				2418	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2419	flush_dcache_page(page);
				2420	kunmap_atomic(kaddr, KM_USER0);
				2421	out:
				2422	ret = mpage_writepage(page, get_block, wbc);
				2423	if (ret == -EAGAIN)
				2424	ret = __block_write_full_page(inode, page, get_block, wbc);
				2425	return ret;
				2426	}
				2427	EXPORT_SYMBOL(nobh_writepage);
				2428
				2429	/*
				2430	* This function assumes that ->prepare_write() uses nobh_prepare_write().
				2431	*/
				2432	int nobh_truncate_page(struct address_space *mapping, loff_t from)
				2433	{
				2434	struct inode *inode = mapping->host;
				2435	unsigned blocksize = 1 << inode->i_blkbits;
				2436	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2437	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2438	unsigned to;
				2439	struct page *page;
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	2440	const struct address_space_operations *a_ops = mapping->a_ops;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2441	char *kaddr;
				2442	int ret = 0;
				2443
				2444	if ((offset & (blocksize - 1)) == 0)
				2445	goto out;
				2446
				2447	ret = -ENOMEM;
				2448	page = grab_cache_page(mapping, index);
				2449	if (!page)
				2450	goto out;
				2451
				2452	to = (offset + blocksize) & ~(blocksize - 1);
				2453	ret = a_ops->prepare_write(NULL, page, offset, to);
				2454	if (ret == 0) {
				2455	kaddr = kmap_atomic(page, KM_USER0);
				2456	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2457	flush_dcache_page(page);
				2458	kunmap_atomic(kaddr, KM_USER0);
Dave Kleikamp	57bf63d	2007-03-06 01:42:12 -0800	[diff] [blame]	2459	/*
				2460	* It would be more correct to call aops->commit_write()
				2461	* here, but this is more efficient.
				2462	*/
				2463	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2464	set_page_dirty(page);
				2465	}
				2466	unlock_page(page);
				2467	page_cache_release(page);
				2468	out:
				2469	return ret;
				2470	}
				2471	EXPORT_SYMBOL(nobh_truncate_page);
				2472
				2473	int block_truncate_page(struct address_space *mapping,
				2474	loff_t from, get_block_t *get_block)
				2475	{
				2476	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2477	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2478	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2479	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2480	unsigned length, pos;
				2481	struct inode *inode = mapping->host;
				2482	struct page *page;
				2483	struct buffer_head *bh;
				2484	void *kaddr;
				2485	int err;
				2486
				2487	blocksize = 1 << inode->i_blkbits;
				2488	length = offset & (blocksize - 1);
				2489
				2490	/* Block boundary? Nothing to do */
				2491	if (!length)
				2492	return 0;
				2493
				2494	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2495	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2496
				2497	page = grab_cache_page(mapping, index);
				2498	err = -ENOMEM;
				2499	if (!page)
				2500	goto out;
				2501
				2502	if (!page_has_buffers(page))
				2503	create_empty_buffers(page, blocksize, 0);
				2504
				2505	/* Find the buffer that contains "offset" */
				2506	bh = page_buffers(page);
				2507	pos = blocksize;
				2508	while (offset >= pos) {
				2509	bh = bh->b_this_page;
				2510	iblock++;
				2511	pos += blocksize;
				2512	}
				2513
				2514	err = 0;
				2515	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2516	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2517	err = get_block(inode, iblock, bh, 0);
				2518	if (err)
				2519	goto unlock;
				2520	/* unmapped? It's a hole - nothing to do */
				2521	if (!buffer_mapped(bh))
				2522	goto unlock;
				2523	}
				2524
				2525	/* Ok, it's mapped. Make sure it's up-to-date */
				2526	if (PageUptodate(page))
				2527	set_buffer_uptodate(bh);
				2528
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2529	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2530	err = -EIO;
				2531	ll_rw_block(READ, 1, &bh);
				2532	wait_on_buffer(bh);
				2533	/* Uhhuh. Read error. Complain and punt. */
				2534	if (!buffer_uptodate(bh))
				2535	goto unlock;
				2536	}
				2537
				2538	kaddr = kmap_atomic(page, KM_USER0);
				2539	memset(kaddr + offset, 0, length);
				2540	flush_dcache_page(page);
				2541	kunmap_atomic(kaddr, KM_USER0);
				2542
				2543	mark_buffer_dirty(bh);
				2544	err = 0;
				2545
				2546	unlock:
				2547	unlock_page(page);
				2548	page_cache_release(page);
				2549	out:
				2550	return err;
				2551	}
				2552
				2553	/*
				2554	* The generic ->writepage function for buffer-backed address_spaces
				2555	*/
				2556	int block_write_full_page(struct page page, get_block_t get_block,
				2557	struct writeback_control *wbc)
				2558	{
				2559	struct inode * const inode = page->mapping->host;
				2560	loff_t i_size = i_size_read(inode);
				2561	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2562	unsigned offset;
				2563	void *kaddr;
				2564
				2565	/* Is the page fully inside i_size? */
				2566	if (page->index < end_index)
				2567	return __block_write_full_page(inode, page, get_block, wbc);
				2568
				2569	/* Is the page fully outside i_size? (truncate in progress) */
				2570	offset = i_size & (PAGE_CACHE_SIZE-1);
				2571	if (page->index >= end_index+1 \|\| !offset) {
				2572	/*
				2573	* The page may have dirty, unmapped buffers. For example,
				2574	* they may have been added in ext3_writepage(). Make them
				2575	* freeable here, so the page does not leak.
				2576	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2577	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2578	unlock_page(page);
				2579	return 0; /* don't care */
				2580	}
				2581
				2582	/*
				2583	* The page straddles i_size. It must be zeroed out on each and every
				2584	* writepage invokation because it may be mmapped. "A file is mapped
				2585	* in multiples of the page size. For a file that is not a multiple of
				2586	* the page size, the remaining memory is zeroed when mapped, and
				2587	* writes to that region are not written out to the file."
				2588	*/
				2589	kaddr = kmap_atomic(page, KM_USER0);
				2590	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2591	flush_dcache_page(page);
				2592	kunmap_atomic(kaddr, KM_USER0);
				2593	return __block_write_full_page(inode, page, get_block, wbc);
				2594	}
				2595
				2596	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2597	get_block_t *get_block)
				2598	{
				2599	struct buffer_head tmp;
				2600	struct inode *inode = mapping->host;
				2601	tmp.b_state = 0;
				2602	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2603	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2604	get_block(inode, block, &tmp, 0);
				2605	return tmp.b_blocknr;
				2606	}
				2607
				2608	static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
				2609	{
				2610	struct buffer_head *bh = bio->bi_private;
				2611
				2612	if (bio->bi_size)
				2613	return 1;
				2614
				2615	if (err == -EOPNOTSUPP) {
				2616	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2617	set_bit(BH_Eopnotsupp, &bh->b_state);
				2618	}
				2619
				2620	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2621	bio_put(bio);
				2622	return 0;
				2623	}
				2624
				2625	int submit_bh(int rw, struct buffer_head * bh)
				2626	{
				2627	struct bio *bio;
				2628	int ret = 0;
				2629
				2630	BUG_ON(!buffer_locked(bh));
				2631	BUG_ON(!buffer_mapped(bh));
				2632	BUG_ON(!bh->b_end_io);
				2633
				2634	if (buffer_ordered(bh) && (rw == WRITE))
				2635	rw = WRITE_BARRIER;
				2636
				2637	/*
				2638	* Only clear out a write error when rewriting, should this
				2639	* include WRITE_SYNC as well?
				2640	*/
				2641	if (test_set_buffer_req(bh) && (rw == WRITE \|\| rw == WRITE_BARRIER))
				2642	clear_buffer_write_io_error(bh);
				2643
				2644	/*
				2645	* from here on down, it's all bio -- do the initial mapping,
				2646	* submit_bio -> generic_make_request may further map this bio around
				2647	*/
				2648	bio = bio_alloc(GFP_NOIO, 1);
				2649
				2650	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2651	bio->bi_bdev = bh->b_bdev;
				2652	bio->bi_io_vec[0].bv_page = bh->b_page;
				2653	bio->bi_io_vec[0].bv_len = bh->b_size;
				2654	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2655
				2656	bio->bi_vcnt = 1;
				2657	bio->bi_idx = 0;
				2658	bio->bi_size = bh->b_size;
				2659
				2660	bio->bi_end_io = end_bio_bh_io_sync;
				2661	bio->bi_private = bh;
				2662
				2663	bio_get(bio);
				2664	submit_bio(rw, bio);
				2665
				2666	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2667	ret = -EOPNOTSUPP;
				2668
				2669	bio_put(bio);
				2670	return ret;
				2671	}
				2672
				2673	/**
				2674	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2675	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2676	* @nr: number of &struct buffer_heads in the array
				2677	* @bhs: array of pointers to &struct buffer_head
				2678	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2679	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2680	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2681	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2682	* are sent to disk. The fourth %READA option is described in the documentation
				2683	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2684	*
				2685	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2686	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				2687	* clean when doing a write request, and any buffer that appears to be
				2688	* up-to-date when doing read request. Further it marks as clean buffers that
				2689	* are processed for writing (the buffer cache won't assume that they are
				2690	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2691	*
				2692	* ll_rw_block sets b_end_io to simple completion handler that marks
				2693	* the buffer up-to-date (if appropriate), unlocks the buffer and wakes
				2694	* any waiters.
				2695	*
				2696	* All of the buffers must be for the same device, and must also be a
				2697	* multiple of the current approved size for the device.
				2698	*/
				2699	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				2700	{
				2701	int i;
				2702
				2703	for (i = 0; i < nr; i++) {
				2704	struct buffer_head *bh = bhs[i];
				2705
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2706	if (rw == SWRITE)
				2707	lock_buffer(bh);
				2708	else if (test_set_buffer_locked(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2709	continue;
				2710
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2711	if (rw == WRITE \|\| rw == SWRITE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2712	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2713	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2714	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2715	submit_bh(WRITE, bh);
				2716	continue;
				2717	}
				2718	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2719	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2720	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2721	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2722	submit_bh(rw, bh);
				2723	continue;
				2724	}
				2725	}
				2726	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2727	}
				2728	}
				2729
				2730	/*
				2731	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2732	* and then start new I/O and then wait upon it. The caller must have a ref on
				2733	* the buffer_head.
				2734	*/
				2735	int sync_dirty_buffer(struct buffer_head *bh)
				2736	{
				2737	int ret = 0;
				2738
				2739	WARN_ON(atomic_read(&bh->b_count) < 1);
				2740	lock_buffer(bh);
				2741	if (test_clear_buffer_dirty(bh)) {
				2742	get_bh(bh);
				2743	bh->b_end_io = end_buffer_write_sync;
				2744	ret = submit_bh(WRITE, bh);
				2745	wait_on_buffer(bh);
				2746	if (buffer_eopnotsupp(bh)) {
				2747	clear_buffer_eopnotsupp(bh);
				2748	ret = -EOPNOTSUPP;
				2749	}
				2750	if (!ret && !buffer_uptodate(bh))
				2751	ret = -EIO;
				2752	} else {
				2753	unlock_buffer(bh);
				2754	}
				2755	return ret;
				2756	}
				2757
				2758	/*
				2759	* try_to_free_buffers() checks if all the buffers on this particular page
				2760	* are unused, and releases them if so.
				2761	*
				2762	* Exclusion against try_to_free_buffers may be obtained by either
				2763	* locking the page or by holding its mapping's private_lock.
				2764	*
				2765	* If the page is dirty but all the buffers are clean then we need to
				2766	* be sure to mark the page clean as well. This is because the page
				2767	* may be against a block device, and a later reattachment of buffers
				2768	* to a dirty page will set all buffers dirty. Which would corrupt
				2769	* filesystem data on the same device.
				2770	*
				2771	* The same applies to regular filesystem pages: if all the buffers are
				2772	* clean then we set the page clean and proceed. To do that, we require
				2773	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				2774	* private_lock.
				2775	*
				2776	* try_to_free_buffers() is non-blocking.
				2777	*/
				2778	static inline int buffer_busy(struct buffer_head *bh)
				2779	{
				2780	return atomic_read(&bh->b_count) \|
				2781	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				2782	}
				2783
				2784	static int
				2785	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				2786	{
				2787	struct buffer_head *head = page_buffers(page);
				2788	struct buffer_head *bh;
				2789
				2790	bh = head;
				2791	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	2792	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2793	set_bit(AS_EIO, &page->mapping->flags);
				2794	if (buffer_busy(bh))
				2795	goto failed;
				2796	bh = bh->b_this_page;
				2797	} while (bh != head);
				2798
				2799	do {
				2800	struct buffer_head *next = bh->b_this_page;
				2801
				2802	if (!list_empty(&bh->b_assoc_buffers))
				2803	__remove_assoc_queue(bh);
				2804	bh = next;
				2805	} while (bh != head);
				2806	*buffers_to_free = head;
				2807	__clear_page_buffers(page);
				2808	return 1;
				2809	failed:
				2810	return 0;
				2811	}
				2812
				2813	int try_to_free_buffers(struct page *page)
				2814	{
				2815	struct address_space * const mapping = page->mapping;
				2816	struct buffer_head *buffers_to_free = NULL;
				2817	int ret = 0;
				2818
				2819	BUG_ON(!PageLocked(page));
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2820	if (PageWriteback(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2821	return 0;
				2822
				2823	if (mapping == NULL) { /* can this still happen? */
				2824	ret = drop_buffers(page, &buffers_to_free);
				2825	goto out;
				2826	}
				2827
				2828	spin_lock(&mapping->private_lock);
				2829	ret = drop_buffers(page, &buffers_to_free);
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2830
				2831	/*
				2832	* If the filesystem writes its buffers by hand (eg ext3)
				2833	* then we can have clean buffers against a dirty page. We
				2834	* clean the page here; otherwise the VM will never notice
				2835	* that the filesystem did any IO at all.
				2836	*
				2837	* Also, during truncate, discard_buffer will have marked all
				2838	* the page's buffers clean. We discover that here and clean
				2839	* the page also.
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	2840	*
				2841	* private_lock must be held over this entire operation in order
				2842	* to synchronise against __set_page_dirty_buffers and prevent the
				2843	* dirty bit from being lost.
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2844	*/
				2845	if (ret)
				2846	cancel_dirty_page(page, PAGE_CACHE_SIZE);
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	2847	spin_unlock(&mapping->private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2848	out:
				2849	if (buffers_to_free) {
				2850	struct buffer_head *bh = buffers_to_free;
				2851
				2852	do {
				2853	struct buffer_head *next = bh->b_this_page;
				2854	free_buffer_head(bh);
				2855	bh = next;
				2856	} while (bh != buffers_to_free);
				2857	}
				2858	return ret;
				2859	}
				2860	EXPORT_SYMBOL(try_to_free_buffers);
				2861
NeilBrown	3978d71	2006-03-26 01:37:17 -0800	[diff] [blame]	2862	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2863	{
				2864	struct address_space *mapping;
				2865
				2866	smp_mb();
				2867	mapping = page_mapping(page);
				2868	if (mapping)
				2869	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2870	}
				2871
				2872	/*
				2873	* There are no bdflush tunables left. But distributions are
				2874	* still running obsolete flush daemons, so we terminate them here.
				2875	*
				2876	* Use of bdflush() is deprecated and will be removed in a future kernel.
				2877	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				2878	*/
				2879	asmlinkage long sys_bdflush(int func, long data)
				2880	{
				2881	static int msg_count;
				2882
				2883	if (!capable(CAP_SYS_ADMIN))
				2884	return -EPERM;
				2885
				2886	if (msg_count < 5) {
				2887	msg_count++;
				2888	printk(KERN_INFO
				2889	"warning: process `%s' used the obsolete bdflush"
				2890	" system call\n", current->comm);
				2891	printk(KERN_INFO "Fix your initscripts?\n");
				2892	}
				2893
				2894	if (func == 1)
				2895	do_exit(0);
				2896	return 0;
				2897	}
				2898
				2899	/*
				2900	* Buffer-head allocation
				2901	*/
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	2902	static struct kmem_cache *bh_cachep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2903
				2904	/*
				2905	* Once the number of bh's in the machine exceeds this level, we start
				2906	* stripping them in writeback.
				2907	*/
				2908	static int max_buffer_heads;
				2909
				2910	int buffer_heads_over_limit;
				2911
				2912	struct bh_accounting {
				2913	int nr; /* Number of live bh's */
				2914	int ratelimit; /* Limit cacheline bouncing */
				2915	};
				2916
				2917	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				2918
				2919	static void recalc_bh_state(void)
				2920	{
				2921	int i;
				2922	int tot = 0;
				2923
				2924	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				2925	return;
				2926	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	2927	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2928	tot += per_cpu(bh_accounting, i).nr;
				2929	buffer_heads_over_limit = (tot > max_buffer_heads);
				2930	}
				2931
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	2932	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2933	{
				2934	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
				2935	if (ret) {
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2936	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2937	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2938	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2939	}
				2940	return ret;
				2941	}
				2942	EXPORT_SYMBOL(alloc_buffer_head);
				2943
				2944	void free_buffer_head(struct buffer_head *bh)
				2945	{
				2946	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				2947	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2948	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2949	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2950	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2951	}
				2952	EXPORT_SYMBOL(free_buffer_head);
				2953
				2954	static void
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	2955	init_buffer_head(void data, struct kmem_cache cachep, unsigned long flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2956	{
Christoph Lameter	50953fe	2007-05-06 14:50:16 -0700	[diff] [blame]	2957	if (flags & SLAB_CTOR_CONSTRUCTOR) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2958	struct buffer_head * bh = (struct buffer_head *)data;
				2959
				2960	memset(bh, 0, sizeof(*bh));
				2961	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				2962	}
				2963	}
				2964
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2965	static void buffer_exit_cpu(int cpu)
				2966	{
				2967	int i;
				2968	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				2969
				2970	for (i = 0; i < BH_LRU_SIZE; i++) {
				2971	brelse(b->bhs[i]);
				2972	b->bhs[i] = NULL;
				2973	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	2974	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				2975	per_cpu(bh_accounting, cpu).nr = 0;
				2976	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2977	}
				2978
				2979	static int buffer_cpu_notify(struct notifier_block *self,
				2980	unsigned long action, void *hcpu)
				2981	{
				2982	if (action == CPU_DEAD)
				2983	buffer_exit_cpu((unsigned long)hcpu);
				2984	return NOTIFY_OK;
				2985	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2986
				2987	void __init buffer_init(void)
				2988	{
				2989	int nrpages;
				2990
				2991	bh_cachep = kmem_cache_create("buffer_head",
Paul Jackson	b019600	2006-03-24 03:16:09 -0800	[diff] [blame]	2992	sizeof(struct buffer_head), 0,
				2993	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				2994	SLAB_MEM_SPREAD),
				2995	init_buffer_head,
				2996	NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2997
				2998	/*
				2999	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3000	*/
				3001	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3002	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3003	hotcpu_notifier(buffer_cpu_notify, 0);
				3004	}
				3005
				3006	EXPORT_SYMBOL(__bforget);
				3007	EXPORT_SYMBOL(__brelse);
				3008	EXPORT_SYMBOL(__wait_on_buffer);
				3009	EXPORT_SYMBOL(block_commit_write);
				3010	EXPORT_SYMBOL(block_prepare_write);
				3011	EXPORT_SYMBOL(block_read_full_page);
				3012	EXPORT_SYMBOL(block_sync_page);
				3013	EXPORT_SYMBOL(block_truncate_page);
				3014	EXPORT_SYMBOL(block_write_full_page);
				3015	EXPORT_SYMBOL(cont_prepare_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3016	EXPORT_SYMBOL(end_buffer_read_sync);
				3017	EXPORT_SYMBOL(end_buffer_write_sync);
				3018	EXPORT_SYMBOL(file_fsync);
				3019	EXPORT_SYMBOL(fsync_bdev);
				3020	EXPORT_SYMBOL(generic_block_bmap);
				3021	EXPORT_SYMBOL(generic_commit_write);
				3022	EXPORT_SYMBOL(generic_cont_expand);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	3023	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3024	EXPORT_SYMBOL(init_buffer);
				3025	EXPORT_SYMBOL(invalidate_bdev);
				3026	EXPORT_SYMBOL(ll_rw_block);
				3027	EXPORT_SYMBOL(mark_buffer_dirty);
				3028	EXPORT_SYMBOL(submit_bh);
				3029	EXPORT_SYMBOL(sync_dirty_buffer);
				3030	EXPORT_SYMBOL(unlock_buffer);