Blame - fs/buffer.c - kernel/msm

blob: 1d0852fa728bd851969e93a9f9cfcf4fea6010bc [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required on older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
				27	#include <linux/smp_lock.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	28	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	29	#include <linux/blkdev.h>
				30	#include <linux/file.h>
				31	#include <linux/quotaops.h>
				32	#include <linux/highmem.h>
				33	#include <linux/module.h>
				34	#include <linux/writeback.h>
				35	#include <linux/hash.h>
				36	#include <linux/suspend.h>
				37	#include <linux/buffer_head.h>
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	38	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	39	#include <linux/bio.h>
				40	#include <linux/notifier.h>
				41	#include <linux/cpu.h>
				42	#include <linux/bitops.h>
				43	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	44	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	45
				46	static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
				47	static void invalidate_bh_lrus(void);
				48
				49	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				50
				51	inline void
				52	init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
				53	{
				54	bh->b_end_io = handler;
				55	bh->b_private = private;
				56	}
				57
				58	static int sync_buffer(void *word)
				59	{
				60	struct block_device *bd;
				61	struct buffer_head *bh
				62	= container_of(word, struct buffer_head, b_state);
				63
				64	smp_mb();
				65	bd = bh->b_bdev;
				66	if (bd)
				67	blk_run_address_space(bd->bd_inode->i_mapping);
				68	io_schedule();
				69	return 0;
				70	}
				71
				72	void fastcall __lock_buffer(struct buffer_head *bh)
				73	{
				74	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				75	TASK_UNINTERRUPTIBLE);
				76	}
				77	EXPORT_SYMBOL(__lock_buffer);
				78
				79	void fastcall unlock_buffer(struct buffer_head *bh)
				80	{
Nick Piggin	72ed3d0	2007-02-10 01:46:22 -0800	[diff] [blame]	81	smp_mb__before_clear_bit();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	82	clear_buffer_locked(bh);
				83	smp_mb__after_clear_bit();
				84	wake_up_bit(&bh->b_state, BH_Lock);
				85	}
				86
				87	/*
				88	* Block until a buffer comes unlocked. This doesn't stop it
				89	* from becoming locked again - you have to lock it yourself
				90	* if you want to preserve its state.
				91	*/
				92	void __wait_on_buffer(struct buffer_head * bh)
				93	{
				94	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				95	}
				96
				97	static void
				98	__clear_page_buffers(struct page *page)
				99	{
				100	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	101	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	102	page_cache_release(page);
				103	}
				104
				105	static void buffer_io_error(struct buffer_head *bh)
				106	{
				107	char b[BDEVNAME_SIZE];
				108
				109	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				110	bdevname(bh->b_bdev, b),
				111	(unsigned long long)bh->b_blocknr);
				112	}
				113
				114	/*
				115	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				116	* unlock the buffer. This is what ll_rw_block uses too.
				117	*/
				118	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				119	{
				120	if (uptodate) {
				121	set_buffer_uptodate(bh);
				122	} else {
				123	/* This happens, due to failed READA attempts. */
				124	clear_buffer_uptodate(bh);
				125	}
				126	unlock_buffer(bh);
				127	put_bh(bh);
				128	}
				129
				130	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				131	{
				132	char b[BDEVNAME_SIZE];
				133
				134	if (uptodate) {
				135	set_buffer_uptodate(bh);
				136	} else {
				137	if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
				138	buffer_io_error(bh);
				139	printk(KERN_WARNING "lost page write due to "
				140	"I/O error on %s\n",
				141	bdevname(bh->b_bdev, b));
				142	}
				143	set_buffer_write_io_error(bh);
				144	clear_buffer_uptodate(bh);
				145	}
				146	unlock_buffer(bh);
				147	put_bh(bh);
				148	}
				149
				150	/*
				151	* Write out and wait upon all the dirty data associated with a block
				152	* device via its mapping. Does not take the superblock lock.
				153	*/
				154	int sync_blockdev(struct block_device *bdev)
				155	{
				156	int ret = 0;
				157
OGAWA Hirofumi	28fd129	2006-01-08 01:02:14 -0800	[diff] [blame]	158	if (bdev)
				159	ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	160	return ret;
				161	}
				162	EXPORT_SYMBOL(sync_blockdev);
				163
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	164	/*
				165	* Write out and wait upon all dirty data associated with this
				166	* device. Filesystem data as well as the underlying block
				167	* device. Takes the superblock lock.
				168	*/
				169	int fsync_bdev(struct block_device *bdev)
				170	{
				171	struct super_block *sb = get_super(bdev);
				172	if (sb) {
				173	int res = fsync_super(sb);
				174	drop_super(sb);
				175	return res;
				176	}
				177	return sync_blockdev(bdev);
				178	}
				179
				180	/**
				181	* freeze_bdev -- lock a filesystem and force it into a consistent state
				182	* @bdev: blockdevice to lock
				183	*
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	184	* This takes the block device bd_mount_sem to make sure no new mounts
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	185	* happen on bdev until thaw_bdev() is called.
				186	* If a superblock is found on this device, we take the s_umount semaphore
				187	* on it to make sure nobody unmounts until the snapshot creation is done.
				188	*/
				189	struct super_block *freeze_bdev(struct block_device *bdev)
				190	{
				191	struct super_block *sb;
				192
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	193	down(&bdev->bd_mount_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	194	sb = get_super(bdev);
				195	if (sb && !(sb->s_flags & MS_RDONLY)) {
				196	sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	197	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	198
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	199	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	200
				201	sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	202	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	203
				204	sync_blockdev(sb->s_bdev);
				205
				206	if (sb->s_op->write_super_lockfs)
				207	sb->s_op->write_super_lockfs(sb);
				208	}
				209
				210	sync_blockdev(bdev);
				211	return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
				212	}
				213	EXPORT_SYMBOL(freeze_bdev);
				214
				215	/**
				216	* thaw_bdev -- unlock filesystem
				217	* @bdev: blockdevice to unlock
				218	* @sb: associated superblock
				219	*
				220	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
				221	*/
				222	void thaw_bdev(struct block_device *bdev, struct super_block *sb)
				223	{
				224	if (sb) {
				225	BUG_ON(sb->s_bdev != bdev);
				226
				227	if (sb->s_op->unlockfs)
				228	sb->s_op->unlockfs(sb);
				229	sb->s_frozen = SB_UNFROZEN;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	230	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	231	wake_up(&sb->s_wait_unfrozen);
				232	drop_super(sb);
				233	}
				234
David Chinner	f73ca1b	2007-01-10 23:15:41 -0800	[diff] [blame]	235	up(&bdev->bd_mount_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	236	}
				237	EXPORT_SYMBOL(thaw_bdev);
				238
				239	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	240	* Various filesystems appear to want __find_get_block to be non-blocking.
				241	* But it's the page lock which protects the buffers. To get around this,
				242	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				243	* private_lock.
				244	*
				245	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				246	* may be quite high. This code could TryLock the page, and if that
				247	* succeeds, there is no need to take private_lock. (But if
				248	* private_lock is contended then so is mapping->tree_lock).
				249	*/
				250	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	251	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	252	{
				253	struct inode *bd_inode = bdev->bd_inode;
				254	struct address_space *bd_mapping = bd_inode->i_mapping;
				255	struct buffer_head *ret = NULL;
				256	pgoff_t index;
				257	struct buffer_head *bh;
				258	struct buffer_head *head;
				259	struct page *page;
				260	int all_mapped = 1;
				261
				262	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				263	page = find_get_page(bd_mapping, index);
				264	if (!page)
				265	goto out;
				266
				267	spin_lock(&bd_mapping->private_lock);
				268	if (!page_has_buffers(page))
				269	goto out_unlock;
				270	head = page_buffers(page);
				271	bh = head;
				272	do {
				273	if (bh->b_blocknr == block) {
				274	ret = bh;
				275	get_bh(bh);
				276	goto out_unlock;
				277	}
				278	if (!buffer_mapped(bh))
				279	all_mapped = 0;
				280	bh = bh->b_this_page;
				281	} while (bh != head);
				282
				283	/* we might be here because some of the buffers on this page are
				284	* not mapped. This is due to various races between
				285	* file io on the block device and getblk. It gets dealt with
				286	* elsewhere, don't buffer_error if we had some unmapped buffers
				287	*/
				288	if (all_mapped) {
				289	printk("__find_get_block_slow() failed. "
				290	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	291	(unsigned long long)block,
				292	(unsigned long long)bh->b_blocknr);
				293	printk("b_state=0x%08lx, b_size=%zu\n",
				294	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	295	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				296	}
				297	out_unlock:
				298	spin_unlock(&bd_mapping->private_lock);
				299	page_cache_release(page);
				300	out:
				301	return ret;
				302	}
				303
				304	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				305	of fs corruption is going on. Trashing dirty data always implies losing
				306	information that was supposed to be just stored on the physical layer
				307	by the user.
				308
				309	Thus invalidate_buffers in general usage is not allowed to trash
				310	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				311	be preserved. These buffers are simply skipped.
				312
				313	We also skip buffers which are still in use. For example this can
				314	happen if a userspace program is reading the block device.
				315
				316	NOTE: In the case where the user removed a removable-media-disk even if
				317	there's still dirty data not synced on disk (due to a bug in the device driver
				318	or due to an error of the user), by not destroying the dirty buffers we could
				319	generate corruption also on the next media inserted, thus a parameter is
				320	necessary to handle this case in the most safe way possible (trying
				321	to not corrupt also the new disk inserted with the data belonging to
				322	the old now corrupted disk). Also for the ramdisk the natural thing
				323	to do in order to release the ramdisk memory is to destroy dirty buffers.
				324
				325	These are two special cases. Normal usage implies the device driver
				326	to issue a sync on the device (without waiting I/O completion) and
				327	then an invalidate_buffers call that doesn't trash dirty buffers.
				328
				329	For handling cache coherency with the blkdev pagecache the 'update' case
				330	has been introduced. It is needed to re-read from disk any pinned
				331	buffer. NOTE: re-reading from disk is destructive so we can do it only
				332	when we assume nobody is changing the buffercache under our I/O and when
				333	we think the disk contains more recent information than the buffercache.
				334	The update == 1 pass marks the buffers we need to update, the update == 2
				335	pass does the actual I/O. */
				336	void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
				337	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	338	struct address_space *mapping = bdev->bd_inode->i_mapping;
				339
				340	if (mapping->nrpages == 0)
				341	return;
				342
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	343	invalidate_bh_lrus();
				344	/*
				345	* FIXME: what about destroy_dirty_buffers?
				346	* We really want to use invalidate_inode_pages2() for
				347	* that, but not until that's cleaned up.
				348	*/
Andrew Morton	fc0ecff	2007-02-10 01:45:39 -0800	[diff] [blame]	349	invalidate_mapping_pages(mapping, 0, -1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	350	}
				351
				352	/*
				353	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				354	*/
				355	static void free_more_memory(void)
				356	{
				357	struct zone **zones;
				358	pg_data_t *pgdat;
				359
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	360	wakeup_pdflush(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	361	yield();
				362
KAMEZAWA Hiroyuki	ec936fc	2006-03-27 01:15:59 -0800	[diff] [blame]	363	for_each_online_pgdat(pgdat) {
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	364	zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	365	if (*zones)
Darren Hart	1ad539b	2005-06-21 17:14:53 -0700	[diff] [blame]	366	try_to_free_pages(zones, GFP_NOFS);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	367	}
				368	}
				369
				370	/*
				371	* I/O completion handler for block_read_full_page() - pages
				372	* which come unlocked at the end of I/O.
				373	*/
				374	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				375	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	376	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	377	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	378	struct buffer_head *tmp;
				379	struct page *page;
				380	int page_uptodate = 1;
				381
				382	BUG_ON(!buffer_async_read(bh));
				383
				384	page = bh->b_page;
				385	if (uptodate) {
				386	set_buffer_uptodate(bh);
				387	} else {
				388	clear_buffer_uptodate(bh);
				389	if (printk_ratelimit())
				390	buffer_io_error(bh);
				391	SetPageError(page);
				392	}
				393
				394	/*
				395	* Be _very_ careful from here on. Bad things can happen if
				396	* two buffer heads end IO at almost the same time and both
				397	* decide that the page is now completely done.
				398	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	399	first = page_buffers(page);
				400	local_irq_save(flags);
				401	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	402	clear_buffer_async_read(bh);
				403	unlock_buffer(bh);
				404	tmp = bh;
				405	do {
				406	if (!buffer_uptodate(tmp))
				407	page_uptodate = 0;
				408	if (buffer_async_read(tmp)) {
				409	BUG_ON(!buffer_locked(tmp));
				410	goto still_busy;
				411	}
				412	tmp = tmp->b_this_page;
				413	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	414	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				415	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	416
				417	/*
				418	* If none of the buffers had errors and they are all
				419	* uptodate then we can set the page uptodate.
				420	*/
				421	if (page_uptodate && !PageError(page))
				422	SetPageUptodate(page);
				423	unlock_page(page);
				424	return;
				425
				426	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	427	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				428	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	429	return;
				430	}
				431
				432	/*
				433	* Completion handler for block_write_full_page() - pages which are unlocked
				434	* during I/O, and which have PageWriteback cleared upon I/O completion.
				435	*/
Adrian Bunk	b6cd0b7	2006-06-27 02:53:54 -0700	[diff] [blame]	436	static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	437	{
				438	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	439	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	440	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	441	struct buffer_head *tmp;
				442	struct page *page;
				443
				444	BUG_ON(!buffer_async_write(bh));
				445
				446	page = bh->b_page;
				447	if (uptodate) {
				448	set_buffer_uptodate(bh);
				449	} else {
				450	if (printk_ratelimit()) {
				451	buffer_io_error(bh);
				452	printk(KERN_WARNING "lost page write due to "
				453	"I/O error on %s\n",
				454	bdevname(bh->b_bdev, b));
				455	}
				456	set_bit(AS_EIO, &page->mapping->flags);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	457	set_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	458	clear_buffer_uptodate(bh);
				459	SetPageError(page);
				460	}
				461
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	462	first = page_buffers(page);
				463	local_irq_save(flags);
				464	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				465
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	466	clear_buffer_async_write(bh);
				467	unlock_buffer(bh);
				468	tmp = bh->b_this_page;
				469	while (tmp != bh) {
				470	if (buffer_async_write(tmp)) {
				471	BUG_ON(!buffer_locked(tmp));
				472	goto still_busy;
				473	}
				474	tmp = tmp->b_this_page;
				475	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	476	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				477	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	478	end_page_writeback(page);
				479	return;
				480
				481	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	482	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				483	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	484	return;
				485	}
				486
				487	/*
				488	* If a page's buffers are under async readin (end_buffer_async_read
				489	* completion) then there is a possibility that another thread of
				490	* control could lock one of the buffers after it has completed
				491	* but while some of the other buffers have not completed. This
				492	* locked buffer would confuse end_buffer_async_read() into not unlocking
				493	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				494	* that this buffer is not under async I/O.
				495	*
				496	* The page comes unlocked when it has no locked buffer_async buffers
				497	* left.
				498	*
				499	* PageLocked prevents anyone starting new async I/O reads against any of
				500	* the buffers.
				501	*
				502	* PageWriteback is used to prevent simultaneous writeout of the same
				503	* page.
				504	*
				505	* PageLocked prevents anyone from starting writeback of a page which is
				506	* under read I/O (PageWriteback is only ever set against a locked page).
				507	*/
				508	static void mark_buffer_async_read(struct buffer_head *bh)
				509	{
				510	bh->b_end_io = end_buffer_async_read;
				511	set_buffer_async_read(bh);
				512	}
				513
				514	void mark_buffer_async_write(struct buffer_head *bh)
				515	{
				516	bh->b_end_io = end_buffer_async_write;
				517	set_buffer_async_write(bh);
				518	}
				519	EXPORT_SYMBOL(mark_buffer_async_write);
				520
				521
				522	/*
				523	* fs/buffer.c contains helper functions for buffer-backed address space's
				524	* fsync functions. A common requirement for buffer-based filesystems is
				525	* that certain data from the backing blockdev needs to be written out for
				526	* a successful fsync(). For example, ext2 indirect blocks need to be
				527	* written back and waited upon before fsync() returns.
				528	*
				529	* The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
				530	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				531	* management of a list of dependent buffers at ->i_mapping->private_list.
				532	*
				533	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				534	* from their controlling inode's queue when they are being freed. But
				535	* try_to_free_buffers() will be operating against the blockdev mapping
				536	* at the time, not against the S_ISREG file which depends on those buffers.
				537	* So the locking for private_list is via the private_lock in the address_space
				538	* which backs the buffers. Which is different from the address_space
				539	* against which the buffers are listed. So for a particular address_space,
				540	* mapping->private_lock does not protect mapping->private_list! In fact,
				541	* mapping->private_list will always be protected by the backing blockdev's
				542	* ->private_lock.
				543	*
				544	* Which introduces a requirement: all buffers on an address_space's
				545	* ->private_list must be from the same address_space: the blockdev's.
				546	*
				547	* address_spaces which do not place buffers at ->private_list via these
				548	* utility functions are free to use private_lock and private_list for
				549	* whatever they want. The only requirement is that list_empty(private_list)
				550	* be true at clear_inode() time.
				551	*
				552	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				553	* filesystems should do that. invalidate_inode_buffers() should just go
				554	* BUG_ON(!list_empty).
				555	*
				556	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				557	* take an address_space, not an inode. And it should be called
				558	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				559	* queued up.
				560	*
				561	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				562	* list if it is already on a list. Because if the buffer is on a list,
				563	* it must already be on the right one. If not, the filesystem is being
				564	* silly. This will save a ton of locking. But first we have to ensure
				565	* that buffers are taken off the old inode's list when they are freed
				566	* (presumably in truncate). That requires careful auditing of all
				567	* filesystems (do it inside bforget()). It could also be done by bringing
				568	* b_inode back.
				569	*/
				570
				571	/*
				572	* The buffer's backing address_space's private_lock must be held
				573	*/
				574	static inline void __remove_assoc_queue(struct buffer_head *bh)
				575	{
				576	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	577	WARN_ON(!bh->b_assoc_map);
				578	if (buffer_write_io_error(bh))
				579	set_bit(AS_EIO, &bh->b_assoc_map->flags);
				580	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	581	}
				582
				583	int inode_has_buffers(struct inode *inode)
				584	{
				585	return !list_empty(&inode->i_data.private_list);
				586	}
				587
				588	/*
				589	* osync is designed to support O_SYNC io. It waits synchronously for
				590	* all already-submitted IO to complete, but does not queue any new
				591	* writes to the disk.
				592	*
				593	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				594	* you dirty the buffers, and then use osync_inode_buffers to wait for
				595	* completion. Any other dirty buffers which are not yet queued for
				596	* write will not be flushed to disk by the osync.
				597	*/
				598	static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
				599	{
				600	struct buffer_head *bh;
				601	struct list_head *p;
				602	int err = 0;
				603
				604	spin_lock(lock);
				605	repeat:
				606	list_for_each_prev(p, list) {
				607	bh = BH_ENTRY(p);
				608	if (buffer_locked(bh)) {
				609	get_bh(bh);
				610	spin_unlock(lock);
				611	wait_on_buffer(bh);
				612	if (!buffer_uptodate(bh))
				613	err = -EIO;
				614	brelse(bh);
				615	spin_lock(lock);
				616	goto repeat;
				617	}
				618	}
				619	spin_unlock(lock);
				620	return err;
				621	}
				622
				623	/**
				624	* sync_mapping_buffers - write out and wait upon a mapping's "associated"
				625	* buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	626	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	627	*
				628	* Starts I/O against the buffers at mapping->private_list, and waits upon
				629	* that I/O.
				630	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	631	* Basically, this is a convenience function for fsync().
				632	* @mapping is a file or directory which needs those buffers to be written for
				633	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	634	*/
				635	int sync_mapping_buffers(struct address_space *mapping)
				636	{
				637	struct address_space *buffer_mapping = mapping->assoc_mapping;
				638
				639	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
				640	return 0;
				641
				642	return fsync_buffers_list(&buffer_mapping->private_lock,
				643	&mapping->private_list);
				644	}
				645	EXPORT_SYMBOL(sync_mapping_buffers);
				646
				647	/*
				648	* Called when we've recently written block `bblock', and it is known that
				649	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				650	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				651	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				652	*/
				653	void write_boundary_block(struct block_device *bdev,
				654	sector_t bblock, unsigned blocksize)
				655	{
				656	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				657	if (bh) {
				658	if (buffer_dirty(bh))
				659	ll_rw_block(WRITE, 1, &bh);
				660	put_bh(bh);
				661	}
				662	}
				663
				664	void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
				665	{
				666	struct address_space *mapping = inode->i_mapping;
				667	struct address_space *buffer_mapping = bh->b_page->mapping;
				668
				669	mark_buffer_dirty(bh);
				670	if (!mapping->assoc_mapping) {
				671	mapping->assoc_mapping = buffer_mapping;
				672	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	673	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	674	}
				675	if (list_empty(&bh->b_assoc_buffers)) {
				676	spin_lock(&buffer_mapping->private_lock);
				677	list_move_tail(&bh->b_assoc_buffers,
				678	&mapping->private_list);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	679	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	680	spin_unlock(&buffer_mapping->private_lock);
				681	}
				682	}
				683	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				684
				685	/*
				686	* Add a page to the dirty page list.
				687	*
				688	* It is a sad fact of life that this function is called from several places
				689	* deeply under spinlocking. It may not sleep.
				690	*
				691	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				692	* dirty-state coherency between the page and the buffers. If the page does
				693	* not have buffers then when they are later attached they will all be set
				694	* dirty.
				695	*
				696	* The buffers are dirtied before the page is dirtied. There's a small race
				697	* window in which a writepage caller may see the page cleanness but not the
				698	* buffer dirtiness. That's fine. If this code were to set the page dirty
				699	* before the buffers, a concurrent writepage caller could clear the page dirty
				700	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				701	* page on the dirty page list.
				702	*
				703	* We use private_lock to lock against try_to_free_buffers while using the
				704	* page's buffer list. Also use this to protect against clean buffers being
				705	* added to the page after it was set dirty.
				706	*
				707	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				708	* address_space though.
				709	*/
				710	int __set_page_dirty_buffers(struct page *page)
				711	{
Nick Piggin	ebf7a22	2006-10-10 04:36:54 +0200	[diff] [blame]	712	struct address_space * const mapping = page_mapping(page);
				713
				714	if (unlikely(!mapping))
				715	return !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	716
				717	spin_lock(&mapping->private_lock);
				718	if (page_has_buffers(page)) {
				719	struct buffer_head *head = page_buffers(page);
				720	struct buffer_head *bh = head;
				721
				722	do {
				723	set_buffer_dirty(bh);
				724	bh = bh->b_this_page;
				725	} while (bh != head);
				726	}
				727	spin_unlock(&mapping->private_lock);
				728
Andrew Morton	8c08540	2006-12-10 02:19:24 -0800	[diff] [blame]	729	if (TestSetPageDirty(page))
				730	return 0;
				731
				732	write_lock_irq(&mapping->tree_lock);
				733	if (page->mapping) { /* Race with truncate? */
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	734	if (mapping_cap_account_dirty(mapping)) {
Andrew Morton	8c08540	2006-12-10 02:19:24 -0800	[diff] [blame]	735	__inc_zone_page_state(page, NR_FILE_DIRTY);
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	736	task_io_account_write(PAGE_CACHE_SIZE);
				737	}
Andrew Morton	8c08540	2006-12-10 02:19:24 -0800	[diff] [blame]	738	radix_tree_tag_set(&mapping->page_tree,
				739	page_index(page), PAGECACHE_TAG_DIRTY);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	740	}
Andrew Morton	8c08540	2006-12-10 02:19:24 -0800	[diff] [blame]	741	write_unlock_irq(&mapping->tree_lock);
				742	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
				743	return 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	744	}
				745	EXPORT_SYMBOL(__set_page_dirty_buffers);
				746
				747	/*
				748	* Write out and wait upon a list of buffers.
				749	*
				750	* We have conflicting pressures: we want to make sure that all
				751	* initially dirty buffers get waited on, but that any subsequently
				752	* dirtied buffers don't. After all, we don't want fsync to last
				753	* forever if somebody is actively writing to the file.
				754	*
				755	* Do this in two main stages: first we copy dirty buffers to a
				756	* temporary inode list, queueing the writes as we go. Then we clean
				757	* up, waiting for those writes to complete.
				758	*
				759	* During this second stage, any subsequent updates to the file may end
				760	* up refiling the buffer on the original inode's dirty list again, so
				761	* there is a chance we will end up with a buffer queued for write but
				762	* not yet completed on that list. So, as a final cleanup we go through
				763	* the osync code to catch these locked, dirty buffers without requeuing
				764	* any newly dirty buffers for write.
				765	*/
				766	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				767	{
				768	struct buffer_head *bh;
				769	struct list_head tmp;
				770	int err = 0, err2;
				771
				772	INIT_LIST_HEAD(&tmp);
				773
				774	spin_lock(lock);
				775	while (!list_empty(list)) {
				776	bh = BH_ENTRY(list->next);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	777	__remove_assoc_queue(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	778	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				779	list_add(&bh->b_assoc_buffers, &tmp);
				780	if (buffer_dirty(bh)) {
				781	get_bh(bh);
				782	spin_unlock(lock);
				783	/*
				784	* Ensure any pending I/O completes so that
				785	* ll_rw_block() actually writes the current
				786	* contents - it is a noop if I/O is still in
				787	* flight on potentially older contents.
				788	*/
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	789	ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	790	brelse(bh);
				791	spin_lock(lock);
				792	}
				793	}
				794	}
				795
				796	while (!list_empty(&tmp)) {
				797	bh = BH_ENTRY(tmp.prev);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	798	list_del_init(&bh->b_assoc_buffers);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	799	get_bh(bh);
				800	spin_unlock(lock);
				801	wait_on_buffer(bh);
				802	if (!buffer_uptodate(bh))
				803	err = -EIO;
				804	brelse(bh);
				805	spin_lock(lock);
				806	}
				807
				808	spin_unlock(lock);
				809	err2 = osync_buffers_list(lock, list);
				810	if (err)
				811	return err;
				812	else
				813	return err2;
				814	}
				815
				816	/*
				817	* Invalidate any and all dirty buffers on a given inode. We are
				818	* probably unmounting the fs, but that doesn't mean we have already
				819	* done a sync(). Just drop the buffers from the inode list.
				820	*
				821	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				822	* assumes that all the buffers are against the blockdev. Not true
				823	* for reiserfs.
				824	*/
				825	void invalidate_inode_buffers(struct inode *inode)
				826	{
				827	if (inode_has_buffers(inode)) {
				828	struct address_space *mapping = &inode->i_data;
				829	struct list_head *list = &mapping->private_list;
				830	struct address_space *buffer_mapping = mapping->assoc_mapping;
				831
				832	spin_lock(&buffer_mapping->private_lock);
				833	while (!list_empty(list))
				834	__remove_assoc_queue(BH_ENTRY(list->next));
				835	spin_unlock(&buffer_mapping->private_lock);
				836	}
				837	}
				838
				839	/*
				840	* Remove any clean buffers from the inode's buffer list. This is called
				841	* when we're trying to free the inode itself. Those buffers can pin it.
				842	*
				843	* Returns true if all buffers were removed.
				844	*/
				845	int remove_inode_buffers(struct inode *inode)
				846	{
				847	int ret = 1;
				848
				849	if (inode_has_buffers(inode)) {
				850	struct address_space *mapping = &inode->i_data;
				851	struct list_head *list = &mapping->private_list;
				852	struct address_space *buffer_mapping = mapping->assoc_mapping;
				853
				854	spin_lock(&buffer_mapping->private_lock);
				855	while (!list_empty(list)) {
				856	struct buffer_head *bh = BH_ENTRY(list->next);
				857	if (buffer_dirty(bh)) {
				858	ret = 0;
				859	break;
				860	}
				861	__remove_assoc_queue(bh);
				862	}
				863	spin_unlock(&buffer_mapping->private_lock);
				864	}
				865	return ret;
				866	}
				867
				868	/*
				869	* Create the appropriate buffers when given a page for data area and
				870	* the size of each buffer.. Use the bh->b_this_page linked list to
				871	* follow the buffers created. Return NULL if unable to create more
				872	* buffers.
				873	*
				874	* The retry flag is used to differentiate async IO (paging, swapping)
				875	* which may not fail from ordinary buffer allocations.
				876	*/
				877	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				878	int retry)
				879	{
				880	struct buffer_head bh, head;
				881	long offset;
				882
				883	try_again:
				884	head = NULL;
				885	offset = PAGE_SIZE;
				886	while ((offset -= size) >= 0) {
				887	bh = alloc_buffer_head(GFP_NOFS);
				888	if (!bh)
				889	goto no_grow;
				890
				891	bh->b_bdev = NULL;
				892	bh->b_this_page = head;
				893	bh->b_blocknr = -1;
				894	head = bh;
				895
				896	bh->b_state = 0;
				897	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	898	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	899	bh->b_size = size;
				900
				901	/* Link the buffer to its page */
				902	set_bh_page(bh, page, offset);
				903
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	904	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	905	}
				906	return head;
				907	/*
				908	* In case anything failed, we just free everything we got.
				909	*/
				910	no_grow:
				911	if (head) {
				912	do {
				913	bh = head;
				914	head = head->b_this_page;
				915	free_buffer_head(bh);
				916	} while (head);
				917	}
				918
				919	/*
				920	* Return failure for non-async IO requests. Async IO requests
				921	* are not allowed to fail, so we have to wait until buffer heads
				922	* become available. But we don't want tasks sleeping with
				923	* partially complete buffers, so all were released above.
				924	*/
				925	if (!retry)
				926	return NULL;
				927
				928	/* We're _really_ low on memory. Now we just
				929	* wait for old buffer heads to become free due to
				930	* finishing IO. Since this is an async request and
				931	* the reserve list is empty, we're sure there are
				932	* async buffer heads in use.
				933	*/
				934	free_more_memory();
				935	goto try_again;
				936	}
				937	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				938
				939	static inline void
				940	link_dev_buffers(struct page page, struct buffer_head head)
				941	{
				942	struct buffer_head bh, tail;
				943
				944	bh = head;
				945	do {
				946	tail = bh;
				947	bh = bh->b_this_page;
				948	} while (bh);
				949	tail->b_this_page = head;
				950	attach_page_buffers(page, head);
				951	}
				952
				953	/*
				954	* Initialise the state of a blockdev page's buffers.
				955	*/
				956	static void
				957	init_page_buffers(struct page page, struct block_device bdev,
				958	sector_t block, int size)
				959	{
				960	struct buffer_head *head = page_buffers(page);
				961	struct buffer_head *bh = head;
				962	int uptodate = PageUptodate(page);
				963
				964	do {
				965	if (!buffer_mapped(bh)) {
				966	init_buffer(bh, NULL, NULL);
				967	bh->b_bdev = bdev;
				968	bh->b_blocknr = block;
				969	if (uptodate)
				970	set_buffer_uptodate(bh);
				971	set_buffer_mapped(bh);
				972	}
				973	block++;
				974	bh = bh->b_this_page;
				975	} while (bh != head);
				976	}
				977
				978	/*
				979	* Create the page-cache page that contains the requested block.
				980	*
				981	* This is used purely for blockdev mappings.
				982	*/
				983	static struct page *
				984	grow_dev_page(struct block_device *bdev, sector_t block,
				985	pgoff_t index, int size)
				986	{
				987	struct inode *inode = bdev->bd_inode;
				988	struct page *page;
				989	struct buffer_head *bh;
				990
				991	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
				992	if (!page)
				993	return NULL;
				994
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	995	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	996
				997	if (page_has_buffers(page)) {
				998	bh = page_buffers(page);
				999	if (bh->b_size == size) {
				1000	init_page_buffers(page, bdev, block, size);
				1001	return page;
				1002	}
				1003	if (!try_to_free_buffers(page))
				1004	goto failed;
				1005	}
				1006
				1007	/*
				1008	* Allocate some buffers for this page
				1009	*/
				1010	bh = alloc_page_buffers(page, size, 0);
				1011	if (!bh)
				1012	goto failed;
				1013
				1014	/*
				1015	* Link the page to the buffers and initialise them. Take the
				1016	* lock to be atomic wrt __find_get_block(), which does not
				1017	* run under the page lock.
				1018	*/
				1019	spin_lock(&inode->i_mapping->private_lock);
				1020	link_dev_buffers(page, bh);
				1021	init_page_buffers(page, bdev, block, size);
				1022	spin_unlock(&inode->i_mapping->private_lock);
				1023	return page;
				1024
				1025	failed:
				1026	BUG();
				1027	unlock_page(page);
				1028	page_cache_release(page);
				1029	return NULL;
				1030	}
				1031
				1032	/*
				1033	* Create buffers for the specified block device block's page. If
				1034	* that page was dirty, the buffers are set dirty also.
				1035	*
				1036	* Except that's a bug. Attaching dirty buffers to a dirty
				1037	* blockdev's page can result in filesystem corruption, because
				1038	* some of those buffers may be aliases of filesystem data.
				1039	* grow_dev_page() will go BUG() if this happens.
				1040	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1041	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1042	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1043	{
				1044	struct page *page;
				1045	pgoff_t index;
				1046	int sizebits;
				1047
				1048	sizebits = -1;
				1049	do {
				1050	sizebits++;
				1051	} while ((size << sizebits) < PAGE_SIZE);
				1052
				1053	index = block >> sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1054
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1055	/*
				1056	* Check for a block which wants to lie outside our maximum possible
				1057	* pagecache index. (this comparison is done using sector_t types).
				1058	*/
				1059	if (unlikely(index != block >> sizebits)) {
				1060	char b[BDEVNAME_SIZE];
				1061
				1062	printk(KERN_ERR "%s: requested out-of-range block %llu for "
				1063	"device %s\n",
				1064	__FUNCTION__, (unsigned long long)block,
				1065	bdevname(bdev, b));
				1066	return -EIO;
				1067	}
				1068	block = index << sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1069	/* Create a page with the proper size buffers.. */
				1070	page = grow_dev_page(bdev, block, index, size);
				1071	if (!page)
				1072	return 0;
				1073	unlock_page(page);
				1074	page_cache_release(page);
				1075	return 1;
				1076	}
				1077
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1078	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1079	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1080	{
				1081	/* Size must be multiple of hard sectorsize */
				1082	if (unlikely(size & (bdev_hardsect_size(bdev)-1) \|\|
				1083	(size < 512 \|\| size > PAGE_SIZE))) {
				1084	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1085	size);
				1086	printk(KERN_ERR "hardsect size: %d\n",
				1087	bdev_hardsect_size(bdev));
				1088
				1089	dump_stack();
				1090	return NULL;
				1091	}
				1092
				1093	for (;;) {
				1094	struct buffer_head * bh;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1095	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1096
				1097	bh = __find_get_block(bdev, block, size);
				1098	if (bh)
				1099	return bh;
				1100
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1101	ret = grow_buffers(bdev, block, size);
				1102	if (ret < 0)
				1103	return NULL;
				1104	if (ret == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1105	free_more_memory();
				1106	}
				1107	}
				1108
				1109	/*
				1110	* The relationship between dirty buffers and dirty pages:
				1111	*
				1112	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1113	* the page is tagged dirty in its radix tree.
				1114	*
				1115	* At all times, the dirtiness of the buffers represents the dirtiness of
				1116	* subsections of the page. If the page has buffers, the page dirty bit is
				1117	* merely a hint about the true dirty state.
				1118	*
				1119	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1120	* (if the page has buffers).
				1121	*
				1122	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1123	* buffers are not.
				1124	*
				1125	* Also. When blockdev buffers are explicitly read with bread(), they
				1126	* individually become uptodate. But their backing page remains not
				1127	* uptodate - even if all of its buffers are uptodate. A subsequent
				1128	* block_read_full_page() against that page will discover all the uptodate
				1129	* buffers, will set the page uptodate and will perform no I/O.
				1130	*/
				1131
				1132	/**
				1133	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1134	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1135	*
				1136	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1137	* backing page dirty, then tag the page as dirty in its address_space's radix
				1138	* tree and then attach the address_space's inode to its superblock's dirty
				1139	* inode list.
				1140	*
				1141	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1142	* mapping->tree_lock and the global inode_lock.
				1143	*/
				1144	void fastcall mark_buffer_dirty(struct buffer_head *bh)
				1145	{
				1146	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
				1147	__set_page_dirty_nobuffers(bh->b_page);
				1148	}
				1149
				1150	/*
				1151	* Decrement a buffer_head's reference count. If all buffers against a page
				1152	* have zero reference count, are clean and unlocked, and if the page is clean
				1153	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1154	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1155	* a page but it ends up not being freed, and buffers may later be reattached).
				1156	*/
				1157	void __brelse(struct buffer_head * buf)
				1158	{
				1159	if (atomic_read(&buf->b_count)) {
				1160	put_bh(buf);
				1161	return;
				1162	}
				1163	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
				1164	WARN_ON(1);
				1165	}
				1166
				1167	/*
				1168	* bforget() is like brelse(), except it discards any
				1169	* potentially dirty data.
				1170	*/
				1171	void __bforget(struct buffer_head *bh)
				1172	{
				1173	clear_buffer_dirty(bh);
				1174	if (!list_empty(&bh->b_assoc_buffers)) {
				1175	struct address_space *buffer_mapping = bh->b_page->mapping;
				1176
				1177	spin_lock(&buffer_mapping->private_lock);
				1178	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	1179	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1180	spin_unlock(&buffer_mapping->private_lock);
				1181	}
				1182	__brelse(bh);
				1183	}
				1184
				1185	static struct buffer_head __bread_slow(struct buffer_head bh)
				1186	{
				1187	lock_buffer(bh);
				1188	if (buffer_uptodate(bh)) {
				1189	unlock_buffer(bh);
				1190	return bh;
				1191	} else {
				1192	get_bh(bh);
				1193	bh->b_end_io = end_buffer_read_sync;
				1194	submit_bh(READ, bh);
				1195	wait_on_buffer(bh);
				1196	if (buffer_uptodate(bh))
				1197	return bh;
				1198	}
				1199	brelse(bh);
				1200	return NULL;
				1201	}
				1202
				1203	/*
				1204	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1205	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1206	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1207	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1208	* CPU's LRUs at the same time.
				1209	*
				1210	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1211	* sb_find_get_block().
				1212	*
				1213	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1214	* a local interrupt disable for that.
				1215	*/
				1216
				1217	#define BH_LRU_SIZE 8
				1218
				1219	struct bh_lru {
				1220	struct buffer_head *bhs[BH_LRU_SIZE];
				1221	};
				1222
				1223	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1224
				1225	#ifdef CONFIG_SMP
				1226	#define bh_lru_lock() local_irq_disable()
				1227	#define bh_lru_unlock() local_irq_enable()
				1228	#else
				1229	#define bh_lru_lock() preempt_disable()
				1230	#define bh_lru_unlock() preempt_enable()
				1231	#endif
				1232
				1233	static inline void check_irqs_on(void)
				1234	{
				1235	#ifdef irqs_disabled
				1236	BUG_ON(irqs_disabled());
				1237	#endif
				1238	}
				1239
				1240	/*
				1241	* The LRU management algorithm is dopey-but-simple. Sorry.
				1242	*/
				1243	static void bh_lru_install(struct buffer_head *bh)
				1244	{
				1245	struct buffer_head *evictee = NULL;
				1246	struct bh_lru *lru;
				1247
				1248	check_irqs_on();
				1249	bh_lru_lock();
				1250	lru = &__get_cpu_var(bh_lrus);
				1251	if (lru->bhs[0] != bh) {
				1252	struct buffer_head *bhs[BH_LRU_SIZE];
				1253	int in;
				1254	int out = 0;
				1255
				1256	get_bh(bh);
				1257	bhs[out++] = bh;
				1258	for (in = 0; in < BH_LRU_SIZE; in++) {
				1259	struct buffer_head *bh2 = lru->bhs[in];
				1260
				1261	if (bh2 == bh) {
				1262	__brelse(bh2);
				1263	} else {
				1264	if (out >= BH_LRU_SIZE) {
				1265	BUG_ON(evictee != NULL);
				1266	evictee = bh2;
				1267	} else {
				1268	bhs[out++] = bh2;
				1269	}
				1270	}
				1271	}
				1272	while (out < BH_LRU_SIZE)
				1273	bhs[out++] = NULL;
				1274	memcpy(lru->bhs, bhs, sizeof(bhs));
				1275	}
				1276	bh_lru_unlock();
				1277
				1278	if (evictee)
				1279	__brelse(evictee);
				1280	}
				1281
				1282	/*
				1283	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1284	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1285	static struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1286	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1287	{
				1288	struct buffer_head *ret = NULL;
				1289	struct bh_lru *lru;
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1290	unsigned int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1291
				1292	check_irqs_on();
				1293	bh_lru_lock();
				1294	lru = &__get_cpu_var(bh_lrus);
				1295	for (i = 0; i < BH_LRU_SIZE; i++) {
				1296	struct buffer_head *bh = lru->bhs[i];
				1297
				1298	if (bh && bh->b_bdev == bdev &&
				1299	bh->b_blocknr == block && bh->b_size == size) {
				1300	if (i) {
				1301	while (i) {
				1302	lru->bhs[i] = lru->bhs[i - 1];
				1303	i--;
				1304	}
				1305	lru->bhs[0] = bh;
				1306	}
				1307	get_bh(bh);
				1308	ret = bh;
				1309	break;
				1310	}
				1311	}
				1312	bh_lru_unlock();
				1313	return ret;
				1314	}
				1315
				1316	/*
				1317	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1318	* it in the LRU and mark it as accessed. If it is not present then return
				1319	* NULL
				1320	*/
				1321	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1322	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1323	{
				1324	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1325
				1326	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1327	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1328	if (bh)
				1329	bh_lru_install(bh);
				1330	}
				1331	if (bh)
				1332	touch_buffer(bh);
				1333	return bh;
				1334	}
				1335	EXPORT_SYMBOL(__find_get_block);
				1336
				1337	/*
				1338	* __getblk will locate (and, if necessary, create) the buffer_head
				1339	* which corresponds to the passed block_device, block and size. The
				1340	* returned buffer has its reference count incremented.
				1341	*
				1342	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1343	* illegal block number, __getblk() will happily return a buffer_head
				1344	* which represents the non-existent block. Very weird.
				1345	*
				1346	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1347	* attempt is failing. FIXME, perhaps?
				1348	*/
				1349	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1350	__getblk(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1351	{
				1352	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1353
				1354	might_sleep();
				1355	if (bh == NULL)
				1356	bh = __getblk_slow(bdev, block, size);
				1357	return bh;
				1358	}
				1359	EXPORT_SYMBOL(__getblk);
				1360
				1361	/*
				1362	* Do async read-ahead on a buffer..
				1363	*/
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1364	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1365	{
				1366	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1367	if (likely(bh)) {
				1368	ll_rw_block(READA, 1, &bh);
				1369	brelse(bh);
				1370	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1371	}
				1372	EXPORT_SYMBOL(__breadahead);
				1373
				1374	/**
				1375	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1376	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1377	* @block: number of block
				1378	* @size: size (in bytes) to read
				1379	*
				1380	* Reads a specified block, and returns buffer head that contains it.
				1381	* It returns NULL if the block was unreadable.
				1382	*/
				1383	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1384	__bread(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1385	{
				1386	struct buffer_head *bh = __getblk(bdev, block, size);
				1387
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1388	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1389	bh = __bread_slow(bh);
				1390	return bh;
				1391	}
				1392	EXPORT_SYMBOL(__bread);
				1393
				1394	/*
				1395	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1396	* This doesn't race because it runs in each cpu either in irq
				1397	* or with preempt disabled.
				1398	*/
				1399	static void invalidate_bh_lru(void *arg)
				1400	{
				1401	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1402	int i;
				1403
				1404	for (i = 0; i < BH_LRU_SIZE; i++) {
				1405	brelse(b->bhs[i]);
				1406	b->bhs[i] = NULL;
				1407	}
				1408	put_cpu_var(bh_lrus);
				1409	}
				1410
				1411	static void invalidate_bh_lrus(void)
				1412	{
				1413	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
				1414	}
				1415
				1416	void set_bh_page(struct buffer_head *bh,
				1417	struct page *page, unsigned long offset)
				1418	{
				1419	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1420	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1421	if (PageHighMem(page))
				1422	/*
				1423	* This catches illegal uses and preserves the offset:
				1424	*/
				1425	bh->b_data = (char *)(0 + offset);
				1426	else
				1427	bh->b_data = page_address(page) + offset;
				1428	}
				1429	EXPORT_SYMBOL(set_bh_page);
				1430
				1431	/*
				1432	* Called when truncating a buffer on a page completely.
				1433	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1434	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1435	{
				1436	lock_buffer(bh);
				1437	clear_buffer_dirty(bh);
				1438	bh->b_bdev = NULL;
				1439	clear_buffer_mapped(bh);
				1440	clear_buffer_req(bh);
				1441	clear_buffer_new(bh);
				1442	clear_buffer_delay(bh);
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1443	clear_buffer_unwritten(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1444	unlock_buffer(bh);
				1445	}
				1446
				1447	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1448	* block_invalidatepage - invalidate part or all of a buffer-backed page
				1449	*
				1450	* @page: the page which is affected
				1451	* @offset: the index of the truncation point
				1452	*
				1453	* block_invalidatepage() is called when all or part of the page has become
				1454	* invalidated by a truncate operation.
				1455	*
				1456	* block_invalidatepage() does not have to release all buffers, but it must
				1457	* ensure that no dirty buffer is left outside @offset and that no I/O
				1458	* is underway against any of the blocks which are outside the truncation
				1459	* point. Because the caller is about to free (and possibly reuse) those
				1460	* blocks on-disk.
				1461	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1462	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1463	{
				1464	struct buffer_head head, bh, *next;
				1465	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1466
				1467	BUG_ON(!PageLocked(page));
				1468	if (!page_has_buffers(page))
				1469	goto out;
				1470
				1471	head = page_buffers(page);
				1472	bh = head;
				1473	do {
				1474	unsigned int next_off = curr_off + bh->b_size;
				1475	next = bh->b_this_page;
				1476
				1477	/*
				1478	* is this block fully invalidated?
				1479	*/
				1480	if (offset <= curr_off)
				1481	discard_buffer(bh);
				1482	curr_off = next_off;
				1483	bh = next;
				1484	} while (bh != head);
				1485
				1486	/*
				1487	* We release buffers only if the entire page is being invalidated.
				1488	* The get_block cached value has been unconditionally invalidated,
				1489	* so real IO is not possible anymore.
				1490	*/
				1491	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1492	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1493	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1494	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1495	}
				1496	EXPORT_SYMBOL(block_invalidatepage);
				1497
				1498	/*
				1499	* We attach and possibly dirty the buffers atomically wrt
				1500	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1501	* is already excluded via the page lock.
				1502	*/
				1503	void create_empty_buffers(struct page *page,
				1504	unsigned long blocksize, unsigned long b_state)
				1505	{
				1506	struct buffer_head bh, head, *tail;
				1507
				1508	head = alloc_page_buffers(page, blocksize, 1);
				1509	bh = head;
				1510	do {
				1511	bh->b_state \|= b_state;
				1512	tail = bh;
				1513	bh = bh->b_this_page;
				1514	} while (bh);
				1515	tail->b_this_page = head;
				1516
				1517	spin_lock(&page->mapping->private_lock);
				1518	if (PageUptodate(page) \|\| PageDirty(page)) {
				1519	bh = head;
				1520	do {
				1521	if (PageDirty(page))
				1522	set_buffer_dirty(bh);
				1523	if (PageUptodate(page))
				1524	set_buffer_uptodate(bh);
				1525	bh = bh->b_this_page;
				1526	} while (bh != head);
				1527	}
				1528	attach_page_buffers(page, head);
				1529	spin_unlock(&page->mapping->private_lock);
				1530	}
				1531	EXPORT_SYMBOL(create_empty_buffers);
				1532
				1533	/*
				1534	* We are taking a block for data and we don't want any output from any
				1535	* buffer-cache aliases starting from return from that function and
				1536	* until the moment when something will explicitly mark the buffer
				1537	* dirty (hopefully that will not happen until we will free that block ;-)
				1538	* We don't even need to mark it not-uptodate - nobody can expect
				1539	* anything from a newly allocated buffer anyway. We used to use
				1540	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1541	* don't want to mark the alias unmapped, for example - it would confuse
				1542	* anyone who might pick it with bread() afterwards...
				1543	*
				1544	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1545	* be writeout I/O going on against recently-freed buffers. We don't
				1546	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1547	* only if we really need to. That happens here.
				1548	*/
				1549	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1550	{
				1551	struct buffer_head *old_bh;
				1552
				1553	might_sleep();
				1554
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1555	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1556	if (old_bh) {
				1557	clear_buffer_dirty(old_bh);
				1558	wait_on_buffer(old_bh);
				1559	clear_buffer_req(old_bh);
				1560	__brelse(old_bh);
				1561	}
				1562	}
				1563	EXPORT_SYMBOL(unmap_underlying_metadata);
				1564
				1565	/*
				1566	* NOTE! All mapped/uptodate combinations are valid:
				1567	*
				1568	* Mapped Uptodate Meaning
				1569	*
				1570	* No No "unknown" - must do get_block()
				1571	* No Yes "hole" - zero-filled
				1572	* Yes No "allocated" - allocated on disk, not read in
				1573	* Yes Yes "valid" - allocated and up-to-date in memory.
				1574	*
				1575	* "Dirty" is valid only with the last case (mapped+uptodate).
				1576	*/
				1577
				1578	/*
				1579	* While block_write_full_page is writing back the dirty buffers under
				1580	* the page lock, whoever dirtied the buffers may decide to clean them
				1581	* again at any time. We handle that by only looking at the buffer
				1582	* state inside lock_buffer().
				1583	*
				1584	* If block_write_full_page() is called for regular writeback
				1585	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1586	* locked buffer. This only can happen if someone has written the buffer
				1587	* directly, with submit_bh(). At the address_space level PageWriteback
				1588	* prevents this contention from occurring.
				1589	*/
				1590	static int __block_write_full_page(struct inode inode, struct page page,
				1591	get_block_t get_block, struct writeback_control wbc)
				1592	{
				1593	int err;
				1594	sector_t block;
				1595	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1596	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1597	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1598	int nr_underway = 0;
				1599
				1600	BUG_ON(!PageLocked(page));
				1601
				1602	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1603
				1604	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1605	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1606	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1607	}
				1608
				1609	/*
				1610	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1611	* here, and the (potentially unmapped) buffers may become dirty at
				1612	* any time. If a buffer becomes dirty here after we've inspected it
				1613	* then we just miss that fact, and the page stays dirty.
				1614	*
				1615	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1616	* handle that here by just cleaning them.
				1617	*/
				1618
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1619	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1620	head = page_buffers(page);
				1621	bh = head;
				1622
				1623	/*
				1624	* Get all the dirty buffers mapped to disk addresses and
				1625	* handle any aliases from the underlying blockdev's mapping.
				1626	*/
				1627	do {
				1628	if (block > last_block) {
				1629	/*
				1630	* mapped buffers outside i_size will occur, because
				1631	* this page can be outside i_size when there is a
				1632	* truncate in progress.
				1633	*/
				1634	/*
				1635	* The buffer was zeroed by block_write_full_page()
				1636	*/
				1637	clear_buffer_dirty(bh);
				1638	set_buffer_uptodate(bh);
				1639	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1640	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1641	err = get_block(inode, block, bh, 1);
				1642	if (err)
				1643	goto recover;
				1644	if (buffer_new(bh)) {
				1645	/* blockdev mappings never come here */
				1646	clear_buffer_new(bh);
				1647	unmap_underlying_metadata(bh->b_bdev,
				1648	bh->b_blocknr);
				1649	}
				1650	}
				1651	bh = bh->b_this_page;
				1652	block++;
				1653	} while (bh != head);
				1654
				1655	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1656	if (!buffer_mapped(bh))
				1657	continue;
				1658	/*
				1659	* If it's a fully non-blocking write attempt and we cannot
				1660	* lock the buffer then redirty the page. Note that this can
				1661	* potentially cause a busy-wait loop from pdflush and kswapd
				1662	* activity, but those code paths have their own higher-level
				1663	* throttling.
				1664	*/
				1665	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1666	lock_buffer(bh);
				1667	} else if (test_set_buffer_locked(bh)) {
				1668	redirty_page_for_writepage(wbc, page);
				1669	continue;
				1670	}
				1671	if (test_clear_buffer_dirty(bh)) {
				1672	mark_buffer_async_write(bh);
				1673	} else {
				1674	unlock_buffer(bh);
				1675	}
				1676	} while ((bh = bh->b_this_page) != head);
				1677
				1678	/*
				1679	* The page and its buffers are protected by PageWriteback(), so we can
				1680	* drop the bh refcounts early.
				1681	*/
				1682	BUG_ON(PageWriteback(page));
				1683	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1684
				1685	do {
				1686	struct buffer_head *next = bh->b_this_page;
				1687	if (buffer_async_write(bh)) {
				1688	submit_bh(WRITE, bh);
				1689	nr_underway++;
				1690	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1691	bh = next;
				1692	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1693	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1694
				1695	err = 0;
				1696	done:
				1697	if (nr_underway == 0) {
				1698	/*
				1699	* The page was marked dirty, but the buffers were
				1700	* clean. Someone wrote them back by hand with
				1701	* ll_rw_block/submit_bh. A rare case.
				1702	*/
				1703	int uptodate = 1;
				1704	do {
				1705	if (!buffer_uptodate(bh)) {
				1706	uptodate = 0;
				1707	break;
				1708	}
				1709	bh = bh->b_this_page;
				1710	} while (bh != head);
				1711	if (uptodate)
				1712	SetPageUptodate(page);
				1713	end_page_writeback(page);
				1714	/*
				1715	* The page and buffer_heads can be released at any time from
				1716	* here on.
				1717	*/
				1718	wbc->pages_skipped++; /* We didn't write this page */
				1719	}
				1720	return err;
				1721
				1722	recover:
				1723	/*
				1724	* ENOSPC, or some other error. We may already have added some
				1725	* blocks to the file, so we need to write these out to avoid
				1726	* exposing stale data.
				1727	* The page is currently locked and not marked for writeback
				1728	*/
				1729	bh = head;
				1730	/* Recovery: lock and submit the mapped buffers */
				1731	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1732	if (buffer_mapped(bh) && buffer_dirty(bh)) {
				1733	lock_buffer(bh);
				1734	mark_buffer_async_write(bh);
				1735	} else {
				1736	/*
				1737	* The buffer may have been set dirty during
				1738	* attachment to a dirty page.
				1739	*/
				1740	clear_buffer_dirty(bh);
				1741	}
				1742	} while ((bh = bh->b_this_page) != head);
				1743	SetPageError(page);
				1744	BUG_ON(PageWriteback(page));
				1745	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1746	do {
				1747	struct buffer_head *next = bh->b_this_page;
				1748	if (buffer_async_write(bh)) {
				1749	clear_buffer_dirty(bh);
				1750	submit_bh(WRITE, bh);
				1751	nr_underway++;
				1752	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1753	bh = next;
				1754	} while (bh != head);
Nick Piggin	ffda9d3	2007-02-20 13:57:54 -0800	[diff] [blame]	1755	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1756	goto done;
				1757	}
				1758
				1759	static int __block_prepare_write(struct inode inode, struct page page,
				1760	unsigned from, unsigned to, get_block_t *get_block)
				1761	{
				1762	unsigned block_start, block_end;
				1763	sector_t block;
				1764	int err = 0;
				1765	unsigned blocksize, bbits;
				1766	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1767
				1768	BUG_ON(!PageLocked(page));
				1769	BUG_ON(from > PAGE_CACHE_SIZE);
				1770	BUG_ON(to > PAGE_CACHE_SIZE);
				1771	BUG_ON(from > to);
				1772
				1773	blocksize = 1 << inode->i_blkbits;
				1774	if (!page_has_buffers(page))
				1775	create_empty_buffers(page, blocksize, 0);
				1776	head = page_buffers(page);
				1777
				1778	bbits = inode->i_blkbits;
				1779	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1780
				1781	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1782	block++, block_start=block_end, bh = bh->b_this_page) {
				1783	block_end = block_start + blocksize;
				1784	if (block_end <= from \|\| block_start >= to) {
				1785	if (PageUptodate(page)) {
				1786	if (!buffer_uptodate(bh))
				1787	set_buffer_uptodate(bh);
				1788	}
				1789	continue;
				1790	}
				1791	if (buffer_new(bh))
				1792	clear_buffer_new(bh);
				1793	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1794	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1795	err = get_block(inode, block, bh, 1);
				1796	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1797	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1798	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1799	unmap_underlying_metadata(bh->b_bdev,
				1800	bh->b_blocknr);
				1801	if (PageUptodate(page)) {
				1802	set_buffer_uptodate(bh);
				1803	continue;
				1804	}
				1805	if (block_end > to \|\| block_start < from) {
				1806	void *kaddr;
				1807
				1808	kaddr = kmap_atomic(page, KM_USER0);
				1809	if (block_end > to)
				1810	memset(kaddr+to, 0,
				1811	block_end-to);
				1812	if (block_start < from)
				1813	memset(kaddr+block_start,
				1814	0, from-block_start);
				1815	flush_dcache_page(page);
				1816	kunmap_atomic(kaddr, KM_USER0);
				1817	}
				1818	continue;
				1819	}
				1820	}
				1821	if (PageUptodate(page)) {
				1822	if (!buffer_uptodate(bh))
				1823	set_buffer_uptodate(bh);
				1824	continue;
				1825	}
				1826	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1827	!buffer_unwritten(bh) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1828	(block_start < from \|\| block_end > to)) {
				1829	ll_rw_block(READ, 1, &bh);
				1830	*wait_bh++=bh;
				1831	}
				1832	}
				1833	/*
				1834	* If we issued read requests - let them complete.
				1835	*/
				1836	while(wait_bh > wait) {
				1837	wait_on_buffer(*--wait_bh);
				1838	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1839	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1840	}
Anton Altaparmakov	152becd	2005-06-23 00:10:21 -0700	[diff] [blame]	1841	if (!err) {
				1842	bh = head;
				1843	do {
				1844	if (buffer_new(bh))
				1845	clear_buffer_new(bh);
				1846	} while ((bh = bh->b_this_page) != head);
				1847	return 0;
				1848	}
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1849	/* Error case: */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1850	/*
				1851	* Zero out any newly allocated blocks to avoid exposing stale
				1852	* data. If BH_New is set, we know that the block was newly
				1853	* allocated in the above loop.
				1854	*/
				1855	bh = head;
				1856	block_start = 0;
				1857	do {
				1858	block_end = block_start+blocksize;
				1859	if (block_end <= from)
				1860	goto next_bh;
				1861	if (block_start >= to)
				1862	break;
				1863	if (buffer_new(bh)) {
				1864	void *kaddr;
				1865
				1866	clear_buffer_new(bh);
				1867	kaddr = kmap_atomic(page, KM_USER0);
				1868	memset(kaddr+block_start, 0, bh->b_size);
Monakhov Dmitriy	8c58165	2006-10-11 01:22:00 -0700	[diff] [blame]	1869	flush_dcache_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1870	kunmap_atomic(kaddr, KM_USER0);
				1871	set_buffer_uptodate(bh);
				1872	mark_buffer_dirty(bh);
				1873	}
				1874	next_bh:
				1875	block_start = block_end;
				1876	bh = bh->b_this_page;
				1877	} while (bh != head);
				1878	return err;
				1879	}
				1880
				1881	static int __block_commit_write(struct inode inode, struct page page,
				1882	unsigned from, unsigned to)
				1883	{
				1884	unsigned block_start, block_end;
				1885	int partial = 0;
				1886	unsigned blocksize;
				1887	struct buffer_head bh, head;
				1888
				1889	blocksize = 1 << inode->i_blkbits;
				1890
				1891	for(bh = head = page_buffers(page), block_start = 0;
				1892	bh != head \|\| !block_start;
				1893	block_start=block_end, bh = bh->b_this_page) {
				1894	block_end = block_start + blocksize;
				1895	if (block_end <= from \|\| block_start >= to) {
				1896	if (!buffer_uptodate(bh))
				1897	partial = 1;
				1898	} else {
				1899	set_buffer_uptodate(bh);
				1900	mark_buffer_dirty(bh);
				1901	}
				1902	}
				1903
				1904	/*
				1905	* If this is a partial write which happened to make all buffers
				1906	* uptodate then we can optimize away a bogus readpage() for
				1907	* the next read(). Here we 'discover' whether the page went
				1908	* uptodate as a result of this (potentially partial) write.
				1909	*/
				1910	if (!partial)
				1911	SetPageUptodate(page);
				1912	return 0;
				1913	}
				1914
				1915	/*
				1916	* Generic "read page" function for block devices that have the normal
				1917	* get_block functionality. This is most of the block device filesystems.
				1918	* Reads the page asynchronously --- the unlock_buffer() and
				1919	* set/clear_buffer_uptodate() functions propagate buffer state into the
				1920	* page struct once IO has completed.
				1921	*/
				1922	int block_read_full_page(struct page page, get_block_t get_block)
				1923	{
				1924	struct inode *inode = page->mapping->host;
				1925	sector_t iblock, lblock;
				1926	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				1927	unsigned int blocksize;
				1928	int nr, i;
				1929	int fully_mapped = 1;
				1930
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	1931	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1932	blocksize = 1 << inode->i_blkbits;
				1933	if (!page_has_buffers(page))
				1934	create_empty_buffers(page, blocksize, 0);
				1935	head = page_buffers(page);
				1936
				1937	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				1938	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				1939	bh = head;
				1940	nr = 0;
				1941	i = 0;
				1942
				1943	do {
				1944	if (buffer_uptodate(bh))
				1945	continue;
				1946
				1947	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1948	int err = 0;
				1949
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1950	fully_mapped = 0;
				1951	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1952	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1953	err = get_block(inode, iblock, bh, 0);
				1954	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1955	SetPageError(page);
				1956	}
				1957	if (!buffer_mapped(bh)) {
				1958	void *kaddr = kmap_atomic(page, KM_USER0);
				1959	memset(kaddr + i * blocksize, 0, blocksize);
				1960	flush_dcache_page(page);
				1961	kunmap_atomic(kaddr, KM_USER0);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	1962	if (!err)
				1963	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1964	continue;
				1965	}
				1966	/*
				1967	* get_block() might have updated the buffer
				1968	* synchronously
				1969	*/
				1970	if (buffer_uptodate(bh))
				1971	continue;
				1972	}
				1973	arr[nr++] = bh;
				1974	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				1975
				1976	if (fully_mapped)
				1977	SetPageMappedToDisk(page);
				1978
				1979	if (!nr) {
				1980	/*
				1981	* All buffers are uptodate - we can set the page uptodate
				1982	* as well. But not if get_block() returned an error.
				1983	*/
				1984	if (!PageError(page))
				1985	SetPageUptodate(page);
				1986	unlock_page(page);
				1987	return 0;
				1988	}
				1989
				1990	/* Stage two: lock the buffers */
				1991	for (i = 0; i < nr; i++) {
				1992	bh = arr[i];
				1993	lock_buffer(bh);
				1994	mark_buffer_async_read(bh);
				1995	}
				1996
				1997	/*
				1998	* Stage 3: start the IO. Check for uptodateness
				1999	* inside the buffer lock in case another process reading
				2000	* the underlying blockdev brought it uptodate (the sct fix).
				2001	*/
				2002	for (i = 0; i < nr; i++) {
				2003	bh = arr[i];
				2004	if (buffer_uptodate(bh))
				2005	end_buffer_async_read(bh, 1);
				2006	else
				2007	submit_bh(READ, bh);
				2008	}
				2009	return 0;
				2010	}
				2011
				2012	/* utility function for filesystems that need to do work on expanding
				2013	* truncates. Uses prepare/commit_write to allow the filesystem to
				2014	* deal with the hole.
				2015	*/
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2016	static int __generic_cont_expand(struct inode *inode, loff_t size,
				2017	pgoff_t index, unsigned int offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2018	{
				2019	struct address_space *mapping = inode->i_mapping;
				2020	struct page *page;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2021	unsigned long limit;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2022	int err;
				2023
				2024	err = -EFBIG;
				2025	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2026	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2027	send_sig(SIGXFSZ, current, 0);
				2028	goto out;
				2029	}
				2030	if (size > inode->i_sb->s_maxbytes)
				2031	goto out;
				2032
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2033	err = -ENOMEM;
				2034	page = grab_cache_page(mapping, index);
				2035	if (!page)
				2036	goto out;
				2037	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2038	if (err) {
				2039	/*
				2040	* ->prepare_write() may have instantiated a few blocks
				2041	* outside i_size. Trim these off again.
				2042	*/
				2043	unlock_page(page);
				2044	page_cache_release(page);
				2045	vmtruncate(inode, inode->i_size);
				2046	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2047	}
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2048
				2049	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
				2050
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2051	unlock_page(page);
				2052	page_cache_release(page);
				2053	if (err > 0)
				2054	err = 0;
				2055	out:
				2056	return err;
				2057	}
				2058
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2059	int generic_cont_expand(struct inode *inode, loff_t size)
				2060	{
				2061	pgoff_t index;
				2062	unsigned int offset;
				2063
				2064	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
				2065
				2066	/* ugh. in prepare/commit_write, if from==to==start of block, we
				2067	** skip the prepare. make sure we never send an offset for the start
				2068	** of a block
				2069	*/
				2070	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				2071	/* caller must handle this extra byte. */
				2072	offset++;
				2073	}
				2074	index = size >> PAGE_CACHE_SHIFT;
				2075
				2076	return __generic_cont_expand(inode, size, index, offset);
				2077	}
				2078
				2079	int generic_cont_expand_simple(struct inode *inode, loff_t size)
				2080	{
				2081	loff_t pos = size - 1;
				2082	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
				2083	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
				2084
				2085	/* prepare/commit_write can handle even if from==to==start of block. */
				2086	return __generic_cont_expand(inode, size, index, offset);
				2087	}
				2088
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2089	/*
				2090	* For moronic filesystems that do not allow holes in file.
				2091	* We may have to extend the file.
				2092	*/
				2093
				2094	int cont_prepare_write(struct page *page, unsigned offset,
				2095	unsigned to, get_block_t get_block, loff_t bytes)
				2096	{
				2097	struct address_space *mapping = page->mapping;
				2098	struct inode *inode = mapping->host;
				2099	struct page *new_page;
				2100	pgoff_t pgpos;
				2101	long status;
				2102	unsigned zerofrom;
				2103	unsigned blocksize = 1 << inode->i_blkbits;
				2104	void *kaddr;
				2105
				2106	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
				2107	status = -ENOMEM;
				2108	new_page = grab_cache_page(mapping, pgpos);
				2109	if (!new_page)
				2110	goto out;
				2111	/* we might sleep */
				2112	if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
				2113	unlock_page(new_page);
				2114	page_cache_release(new_page);
				2115	continue;
				2116	}
				2117	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2118	if (zerofrom & (blocksize-1)) {
				2119	*bytes \|= (blocksize-1);
				2120	(*bytes)++;
				2121	}
				2122	status = __block_prepare_write(inode, new_page, zerofrom,
				2123	PAGE_CACHE_SIZE, get_block);
				2124	if (status)
				2125	goto out_unmap;
				2126	kaddr = kmap_atomic(new_page, KM_USER0);
				2127	memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
				2128	flush_dcache_page(new_page);
				2129	kunmap_atomic(kaddr, KM_USER0);
				2130	generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
				2131	unlock_page(new_page);
				2132	page_cache_release(new_page);
				2133	}
				2134
				2135	if (page->index < pgpos) {
				2136	/* completely inside the area */
				2137	zerofrom = offset;
				2138	} else {
				2139	/* page covers the boundary, find the boundary offset */
				2140	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2141
				2142	/* if we will expand the thing last block will be filled */
				2143	if (to > zerofrom && (zerofrom & (blocksize-1))) {
				2144	*bytes \|= (blocksize-1);
				2145	(*bytes)++;
				2146	}
				2147
				2148	/* starting below the boundary? Nothing to zero out */
				2149	if (offset <= zerofrom)
				2150	zerofrom = offset;
				2151	}
				2152	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
				2153	if (status)
				2154	goto out1;
				2155	if (zerofrom < offset) {
				2156	kaddr = kmap_atomic(page, KM_USER0);
				2157	memset(kaddr+zerofrom, 0, offset-zerofrom);
				2158	flush_dcache_page(page);
				2159	kunmap_atomic(kaddr, KM_USER0);
				2160	__block_commit_write(inode, page, zerofrom, offset);
				2161	}
				2162	return 0;
				2163	out1:
				2164	ClearPageUptodate(page);
				2165	return status;
				2166
				2167	out_unmap:
				2168	ClearPageUptodate(new_page);
				2169	unlock_page(new_page);
				2170	page_cache_release(new_page);
				2171	out:
				2172	return status;
				2173	}
				2174
				2175	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2176	get_block_t *get_block)
				2177	{
				2178	struct inode *inode = page->mapping->host;
				2179	int err = __block_prepare_write(inode, page, from, to, get_block);
				2180	if (err)
				2181	ClearPageUptodate(page);
				2182	return err;
				2183	}
				2184
				2185	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2186	{
				2187	struct inode *inode = page->mapping->host;
				2188	__block_commit_write(inode,page,from,to);
				2189	return 0;
				2190	}
				2191
				2192	int generic_commit_write(struct file file, struct page page,
				2193	unsigned from, unsigned to)
				2194	{
				2195	struct inode *inode = page->mapping->host;
				2196	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2197	__block_commit_write(inode,page,from,to);
				2198	/*
				2199	* No need to use i_size_read() here, the i_size
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	2200	* cannot change under us because we hold i_mutex.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2201	*/
				2202	if (pos > inode->i_size) {
				2203	i_size_write(inode, pos);
				2204	mark_inode_dirty(inode);
				2205	}
				2206	return 0;
				2207	}
				2208
				2209
				2210	/*
				2211	* nobh_prepare_write()'s prereads are special: the buffer_heads are freed
				2212	* immediately, while under the page lock. So it needs a special end_io
				2213	* handler which does not touch the bh after unlocking it.
				2214	*
				2215	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				2216	* a race there is benign: unlock_buffer() only use the bh's address for
				2217	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				2218	* itself.
				2219	*/
				2220	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2221	{
				2222	if (uptodate) {
				2223	set_buffer_uptodate(bh);
				2224	} else {
				2225	/* This happens, due to failed READA attempts. */
				2226	clear_buffer_uptodate(bh);
				2227	}
				2228	unlock_buffer(bh);
				2229	}
				2230
				2231	/*
				2232	* On entry, the page is fully not uptodate.
				2233	* On exit the page is fully uptodate in the areas outside (from,to)
				2234	*/
				2235	int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
				2236	get_block_t *get_block)
				2237	{
				2238	struct inode *inode = page->mapping->host;
				2239	const unsigned blkbits = inode->i_blkbits;
				2240	const unsigned blocksize = 1 << blkbits;
				2241	struct buffer_head map_bh;
				2242	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
				2243	unsigned block_in_page;
				2244	unsigned block_start;
				2245	sector_t block_in_file;
				2246	char *kaddr;
				2247	int nr_reads = 0;
				2248	int i;
				2249	int ret = 0;
				2250	int is_mapped_to_disk = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2251
				2252	if (PageMappedToDisk(page))
				2253	return 0;
				2254
				2255	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
				2256	map_bh.b_page = page;
				2257
				2258	/*
				2259	* We loop across all blocks in the page, whether or not they are
				2260	* part of the affected region. This is so we can discover if the
				2261	* page is fully mapped-to-disk.
				2262	*/
				2263	for (block_start = 0, block_in_page = 0;
				2264	block_start < PAGE_CACHE_SIZE;
				2265	block_in_page++, block_start += blocksize) {
				2266	unsigned block_end = block_start + blocksize;
				2267	int create;
				2268
				2269	map_bh.b_state = 0;
				2270	create = 1;
				2271	if (block_start >= to)
				2272	create = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2273	map_bh.b_size = blocksize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2274	ret = get_block(inode, block_in_file + block_in_page,
				2275	&map_bh, create);
				2276	if (ret)
				2277	goto failed;
				2278	if (!buffer_mapped(&map_bh))
				2279	is_mapped_to_disk = 0;
				2280	if (buffer_new(&map_bh))
				2281	unmap_underlying_metadata(map_bh.b_bdev,
				2282	map_bh.b_blocknr);
				2283	if (PageUptodate(page))
				2284	continue;
				2285	if (buffer_new(&map_bh) \|\| !buffer_mapped(&map_bh)) {
				2286	kaddr = kmap_atomic(page, KM_USER0);
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2287	if (block_start < from)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2288	memset(kaddr+block_start, 0, from-block_start);
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2289	if (block_end > to)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2290	memset(kaddr + to, 0, block_end - to);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2291	flush_dcache_page(page);
				2292	kunmap_atomic(kaddr, KM_USER0);
				2293	continue;
				2294	}
				2295	if (buffer_uptodate(&map_bh))
				2296	continue; /* reiserfs does this */
				2297	if (block_start < from \|\| block_end > to) {
				2298	struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
				2299
				2300	if (!bh) {
				2301	ret = -ENOMEM;
				2302	goto failed;
				2303	}
				2304	bh->b_state = map_bh.b_state;
				2305	atomic_set(&bh->b_count, 0);
				2306	bh->b_this_page = NULL;
				2307	bh->b_page = page;
				2308	bh->b_blocknr = map_bh.b_blocknr;
				2309	bh->b_size = blocksize;
				2310	bh->b_data = (char *)(long)block_start;
				2311	bh->b_bdev = map_bh.b_bdev;
				2312	bh->b_private = NULL;
				2313	read_bh[nr_reads++] = bh;
				2314	}
				2315	}
				2316
				2317	if (nr_reads) {
				2318	struct buffer_head *bh;
				2319
				2320	/*
				2321	* The page is locked, so these buffers are protected from
				2322	* any VM or truncate activity. Hence we don't need to care
				2323	* for the buffer_head refcounts.
				2324	*/
				2325	for (i = 0; i < nr_reads; i++) {
				2326	bh = read_bh[i];
				2327	lock_buffer(bh);
				2328	bh->b_end_io = end_buffer_read_nobh;
				2329	submit_bh(READ, bh);
				2330	}
				2331	for (i = 0; i < nr_reads; i++) {
				2332	bh = read_bh[i];
				2333	wait_on_buffer(bh);
				2334	if (!buffer_uptodate(bh))
				2335	ret = -EIO;
				2336	free_buffer_head(bh);
				2337	read_bh[i] = NULL;
				2338	}
				2339	if (ret)
				2340	goto failed;
				2341	}
				2342
				2343	if (is_mapped_to_disk)
				2344	SetPageMappedToDisk(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2345
				2346	return 0;
				2347
				2348	failed:
				2349	for (i = 0; i < nr_reads; i++) {
				2350	if (read_bh[i])
				2351	free_buffer_head(read_bh[i]);
				2352	}
				2353
				2354	/*
				2355	* Error recovery is pretty slack. Clear the page and mark it dirty
				2356	* so we'll later zero out any blocks which _were_ allocated.
				2357	*/
				2358	kaddr = kmap_atomic(page, KM_USER0);
				2359	memset(kaddr, 0, PAGE_CACHE_SIZE);
Monakhov Dmitriy	8c58165	2006-10-11 01:22:00 -0700	[diff] [blame]	2360	flush_dcache_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2361	kunmap_atomic(kaddr, KM_USER0);
				2362	SetPageUptodate(page);
				2363	set_page_dirty(page);
				2364	return ret;
				2365	}
				2366	EXPORT_SYMBOL(nobh_prepare_write);
				2367
Dave Kleikamp	57bf63d	2007-03-06 01:42:12 -0800	[diff] [blame]	2368	/*
				2369	* Make sure any changes to nobh_commit_write() are reflected in
				2370	* nobh_truncate_page(), since it doesn't call commit_write().
				2371	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2372	int nobh_commit_write(struct file file, struct page page,
				2373	unsigned from, unsigned to)
				2374	{
				2375	struct inode *inode = page->mapping->host;
				2376	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2377
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2378	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2379	set_page_dirty(page);
				2380	if (pos > inode->i_size) {
				2381	i_size_write(inode, pos);
				2382	mark_inode_dirty(inode);
				2383	}
				2384	return 0;
				2385	}
				2386	EXPORT_SYMBOL(nobh_commit_write);
				2387
				2388	/*
				2389	* nobh_writepage() - based on block_full_write_page() except
				2390	* that it tries to operate without attaching bufferheads to
				2391	* the page.
				2392	*/
				2393	int nobh_writepage(struct page page, get_block_t get_block,
				2394	struct writeback_control *wbc)
				2395	{
				2396	struct inode * const inode = page->mapping->host;
				2397	loff_t i_size = i_size_read(inode);
				2398	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2399	unsigned offset;
				2400	void *kaddr;
				2401	int ret;
				2402
				2403	/* Is the page fully inside i_size? */
				2404	if (page->index < end_index)
				2405	goto out;
				2406
				2407	/* Is the page fully outside i_size? (truncate in progress) */
				2408	offset = i_size & (PAGE_CACHE_SIZE-1);
				2409	if (page->index >= end_index+1 \|\| !offset) {
				2410	/*
				2411	* The page may have dirty, unmapped buffers. For example,
				2412	* they may have been added in ext3_writepage(). Make them
				2413	* freeable here, so the page does not leak.
				2414	*/
				2415	#if 0
				2416	/* Not really sure about this - do we need this ? */
				2417	if (page->mapping->a_ops->invalidatepage)
				2418	page->mapping->a_ops->invalidatepage(page, offset);
				2419	#endif
				2420	unlock_page(page);
				2421	return 0; /* don't care */
				2422	}
				2423
				2424	/*
				2425	* The page straddles i_size. It must be zeroed out on each and every
				2426	* writepage invocation because it may be mmapped. "A file is mapped
				2427	* in multiples of the page size. For a file that is not a multiple of
				2428	* the page size, the remaining memory is zeroed when mapped, and
				2429	* writes to that region are not written out to the file."
				2430	*/
				2431	kaddr = kmap_atomic(page, KM_USER0);
				2432	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2433	flush_dcache_page(page);
				2434	kunmap_atomic(kaddr, KM_USER0);
				2435	out:
				2436	ret = mpage_writepage(page, get_block, wbc);
				2437	if (ret == -EAGAIN)
				2438	ret = __block_write_full_page(inode, page, get_block, wbc);
				2439	return ret;
				2440	}
				2441	EXPORT_SYMBOL(nobh_writepage);
				2442
				2443	/*
				2444	* This function assumes that ->prepare_write() uses nobh_prepare_write().
				2445	*/
				2446	int nobh_truncate_page(struct address_space *mapping, loff_t from)
				2447	{
				2448	struct inode *inode = mapping->host;
				2449	unsigned blocksize = 1 << inode->i_blkbits;
				2450	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2451	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2452	unsigned to;
				2453	struct page *page;
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	2454	const struct address_space_operations *a_ops = mapping->a_ops;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2455	char *kaddr;
				2456	int ret = 0;
				2457
				2458	if ((offset & (blocksize - 1)) == 0)
				2459	goto out;
				2460
				2461	ret = -ENOMEM;
				2462	page = grab_cache_page(mapping, index);
				2463	if (!page)
				2464	goto out;
				2465
				2466	to = (offset + blocksize) & ~(blocksize - 1);
				2467	ret = a_ops->prepare_write(NULL, page, offset, to);
				2468	if (ret == 0) {
				2469	kaddr = kmap_atomic(page, KM_USER0);
				2470	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2471	flush_dcache_page(page);
				2472	kunmap_atomic(kaddr, KM_USER0);
Dave Kleikamp	57bf63d	2007-03-06 01:42:12 -0800	[diff] [blame]	2473	/*
				2474	* It would be more correct to call aops->commit_write()
				2475	* here, but this is more efficient.
				2476	*/
				2477	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2478	set_page_dirty(page);
				2479	}
				2480	unlock_page(page);
				2481	page_cache_release(page);
				2482	out:
				2483	return ret;
				2484	}
				2485	EXPORT_SYMBOL(nobh_truncate_page);
				2486
				2487	int block_truncate_page(struct address_space *mapping,
				2488	loff_t from, get_block_t *get_block)
				2489	{
				2490	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2491	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2492	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2493	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2494	unsigned length, pos;
				2495	struct inode *inode = mapping->host;
				2496	struct page *page;
				2497	struct buffer_head *bh;
				2498	void *kaddr;
				2499	int err;
				2500
				2501	blocksize = 1 << inode->i_blkbits;
				2502	length = offset & (blocksize - 1);
				2503
				2504	/* Block boundary? Nothing to do */
				2505	if (!length)
				2506	return 0;
				2507
				2508	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2509	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2510
				2511	page = grab_cache_page(mapping, index);
				2512	err = -ENOMEM;
				2513	if (!page)
				2514	goto out;
				2515
				2516	if (!page_has_buffers(page))
				2517	create_empty_buffers(page, blocksize, 0);
				2518
				2519	/* Find the buffer that contains "offset" */
				2520	bh = page_buffers(page);
				2521	pos = blocksize;
				2522	while (offset >= pos) {
				2523	bh = bh->b_this_page;
				2524	iblock++;
				2525	pos += blocksize;
				2526	}
				2527
				2528	err = 0;
				2529	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2530	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2531	err = get_block(inode, iblock, bh, 0);
				2532	if (err)
				2533	goto unlock;
				2534	/* unmapped? It's a hole - nothing to do */
				2535	if (!buffer_mapped(bh))
				2536	goto unlock;
				2537	}
				2538
				2539	/* Ok, it's mapped. Make sure it's up-to-date */
				2540	if (PageUptodate(page))
				2541	set_buffer_uptodate(bh);
				2542
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2543	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2544	err = -EIO;
				2545	ll_rw_block(READ, 1, &bh);
				2546	wait_on_buffer(bh);
				2547	/* Uhhuh. Read error. Complain and punt. */
				2548	if (!buffer_uptodate(bh))
				2549	goto unlock;
				2550	}
				2551
				2552	kaddr = kmap_atomic(page, KM_USER0);
				2553	memset(kaddr + offset, 0, length);
				2554	flush_dcache_page(page);
				2555	kunmap_atomic(kaddr, KM_USER0);
				2556
				2557	mark_buffer_dirty(bh);
				2558	err = 0;
				2559
				2560	unlock:
				2561	unlock_page(page);
				2562	page_cache_release(page);
				2563	out:
				2564	return err;
				2565	}
				2566
				2567	/*
				2568	* The generic ->writepage function for buffer-backed address_spaces
				2569	*/
				2570	int block_write_full_page(struct page page, get_block_t get_block,
				2571	struct writeback_control *wbc)
				2572	{
				2573	struct inode * const inode = page->mapping->host;
				2574	loff_t i_size = i_size_read(inode);
				2575	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2576	unsigned offset;
				2577	void *kaddr;
				2578
				2579	/* Is the page fully inside i_size? */
				2580	if (page->index < end_index)
				2581	return __block_write_full_page(inode, page, get_block, wbc);
				2582
				2583	/* Is the page fully outside i_size? (truncate in progress) */
				2584	offset = i_size & (PAGE_CACHE_SIZE-1);
				2585	if (page->index >= end_index+1 \|\| !offset) {
				2586	/*
				2587	* The page may have dirty, unmapped buffers. For example,
				2588	* they may have been added in ext3_writepage(). Make them
				2589	* freeable here, so the page does not leak.
				2590	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2591	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2592	unlock_page(page);
				2593	return 0; /* don't care */
				2594	}
				2595
				2596	/*
				2597	* The page straddles i_size. It must be zeroed out on each and every
				2598	* writepage invokation because it may be mmapped. "A file is mapped
				2599	* in multiples of the page size. For a file that is not a multiple of
				2600	* the page size, the remaining memory is zeroed when mapped, and
				2601	* writes to that region are not written out to the file."
				2602	*/
				2603	kaddr = kmap_atomic(page, KM_USER0);
				2604	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2605	flush_dcache_page(page);
				2606	kunmap_atomic(kaddr, KM_USER0);
				2607	return __block_write_full_page(inode, page, get_block, wbc);
				2608	}
				2609
				2610	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2611	get_block_t *get_block)
				2612	{
				2613	struct buffer_head tmp;
				2614	struct inode *inode = mapping->host;
				2615	tmp.b_state = 0;
				2616	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2617	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2618	get_block(inode, block, &tmp, 0);
				2619	return tmp.b_blocknr;
				2620	}
				2621
				2622	static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
				2623	{
				2624	struct buffer_head *bh = bio->bi_private;
				2625
				2626	if (bio->bi_size)
				2627	return 1;
				2628
				2629	if (err == -EOPNOTSUPP) {
				2630	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2631	set_bit(BH_Eopnotsupp, &bh->b_state);
				2632	}
				2633
				2634	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2635	bio_put(bio);
				2636	return 0;
				2637	}
				2638
				2639	int submit_bh(int rw, struct buffer_head * bh)
				2640	{
				2641	struct bio *bio;
				2642	int ret = 0;
				2643
				2644	BUG_ON(!buffer_locked(bh));
				2645	BUG_ON(!buffer_mapped(bh));
				2646	BUG_ON(!bh->b_end_io);
				2647
				2648	if (buffer_ordered(bh) && (rw == WRITE))
				2649	rw = WRITE_BARRIER;
				2650
				2651	/*
				2652	* Only clear out a write error when rewriting, should this
				2653	* include WRITE_SYNC as well?
				2654	*/
				2655	if (test_set_buffer_req(bh) && (rw == WRITE \|\| rw == WRITE_BARRIER))
				2656	clear_buffer_write_io_error(bh);
				2657
				2658	/*
				2659	* from here on down, it's all bio -- do the initial mapping,
				2660	* submit_bio -> generic_make_request may further map this bio around
				2661	*/
				2662	bio = bio_alloc(GFP_NOIO, 1);
				2663
				2664	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2665	bio->bi_bdev = bh->b_bdev;
				2666	bio->bi_io_vec[0].bv_page = bh->b_page;
				2667	bio->bi_io_vec[0].bv_len = bh->b_size;
				2668	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2669
				2670	bio->bi_vcnt = 1;
				2671	bio->bi_idx = 0;
				2672	bio->bi_size = bh->b_size;
				2673
				2674	bio->bi_end_io = end_bio_bh_io_sync;
				2675	bio->bi_private = bh;
				2676
				2677	bio_get(bio);
				2678	submit_bio(rw, bio);
				2679
				2680	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2681	ret = -EOPNOTSUPP;
				2682
				2683	bio_put(bio);
				2684	return ret;
				2685	}
				2686
				2687	/**
				2688	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2689	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2690	* @nr: number of &struct buffer_heads in the array
				2691	* @bhs: array of pointers to &struct buffer_head
				2692	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2693	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2694	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2695	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2696	* are sent to disk. The fourth %READA option is described in the documentation
				2697	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2698	*
				2699	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2700	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				2701	* clean when doing a write request, and any buffer that appears to be
				2702	* up-to-date when doing read request. Further it marks as clean buffers that
				2703	* are processed for writing (the buffer cache won't assume that they are
				2704	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2705	*
				2706	* ll_rw_block sets b_end_io to simple completion handler that marks
				2707	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				2708	* any waiters.
				2709	*
				2710	* All of the buffers must be for the same device, and must also be a
				2711	* multiple of the current approved size for the device.
				2712	*/
				2713	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				2714	{
				2715	int i;
				2716
				2717	for (i = 0; i < nr; i++) {
				2718	struct buffer_head *bh = bhs[i];
				2719
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2720	if (rw == SWRITE)
				2721	lock_buffer(bh);
				2722	else if (test_set_buffer_locked(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2723	continue;
				2724
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2725	if (rw == WRITE \|\| rw == SWRITE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2726	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2727	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2728	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2729	submit_bh(WRITE, bh);
				2730	continue;
				2731	}
				2732	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2733	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2734	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2735	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2736	submit_bh(rw, bh);
				2737	continue;
				2738	}
				2739	}
				2740	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2741	}
				2742	}
				2743
				2744	/*
				2745	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2746	* and then start new I/O and then wait upon it. The caller must have a ref on
				2747	* the buffer_head.
				2748	*/
				2749	int sync_dirty_buffer(struct buffer_head *bh)
				2750	{
				2751	int ret = 0;
				2752
				2753	WARN_ON(atomic_read(&bh->b_count) < 1);
				2754	lock_buffer(bh);
				2755	if (test_clear_buffer_dirty(bh)) {
				2756	get_bh(bh);
				2757	bh->b_end_io = end_buffer_write_sync;
				2758	ret = submit_bh(WRITE, bh);
				2759	wait_on_buffer(bh);
				2760	if (buffer_eopnotsupp(bh)) {
				2761	clear_buffer_eopnotsupp(bh);
				2762	ret = -EOPNOTSUPP;
				2763	}
				2764	if (!ret && !buffer_uptodate(bh))
				2765	ret = -EIO;
				2766	} else {
				2767	unlock_buffer(bh);
				2768	}
				2769	return ret;
				2770	}
				2771
				2772	/*
				2773	* try_to_free_buffers() checks if all the buffers on this particular page
				2774	* are unused, and releases them if so.
				2775	*
				2776	* Exclusion against try_to_free_buffers may be obtained by either
				2777	* locking the page or by holding its mapping's private_lock.
				2778	*
				2779	* If the page is dirty but all the buffers are clean then we need to
				2780	* be sure to mark the page clean as well. This is because the page
				2781	* may be against a block device, and a later reattachment of buffers
				2782	* to a dirty page will set all buffers dirty. Which would corrupt
				2783	* filesystem data on the same device.
				2784	*
				2785	* The same applies to regular filesystem pages: if all the buffers are
				2786	* clean then we set the page clean and proceed. To do that, we require
				2787	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				2788	* private_lock.
				2789	*
				2790	* try_to_free_buffers() is non-blocking.
				2791	*/
				2792	static inline int buffer_busy(struct buffer_head *bh)
				2793	{
				2794	return atomic_read(&bh->b_count) \|
				2795	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				2796	}
				2797
				2798	static int
				2799	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				2800	{
				2801	struct buffer_head *head = page_buffers(page);
				2802	struct buffer_head *bh;
				2803
				2804	bh = head;
				2805	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	2806	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2807	set_bit(AS_EIO, &page->mapping->flags);
				2808	if (buffer_busy(bh))
				2809	goto failed;
				2810	bh = bh->b_this_page;
				2811	} while (bh != head);
				2812
				2813	do {
				2814	struct buffer_head *next = bh->b_this_page;
				2815
				2816	if (!list_empty(&bh->b_assoc_buffers))
				2817	__remove_assoc_queue(bh);
				2818	bh = next;
				2819	} while (bh != head);
				2820	*buffers_to_free = head;
				2821	__clear_page_buffers(page);
				2822	return 1;
				2823	failed:
				2824	return 0;
				2825	}
				2826
				2827	int try_to_free_buffers(struct page *page)
				2828	{
				2829	struct address_space * const mapping = page->mapping;
				2830	struct buffer_head *buffers_to_free = NULL;
				2831	int ret = 0;
				2832
				2833	BUG_ON(!PageLocked(page));
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2834	if (PageWriteback(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2835	return 0;
				2836
				2837	if (mapping == NULL) { /* can this still happen? */
				2838	ret = drop_buffers(page, &buffers_to_free);
				2839	goto out;
				2840	}
				2841
				2842	spin_lock(&mapping->private_lock);
				2843	ret = drop_buffers(page, &buffers_to_free);
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2844
				2845	/*
				2846	* If the filesystem writes its buffers by hand (eg ext3)
				2847	* then we can have clean buffers against a dirty page. We
				2848	* clean the page here; otherwise the VM will never notice
				2849	* that the filesystem did any IO at all.
				2850	*
				2851	* Also, during truncate, discard_buffer will have marked all
				2852	* the page's buffers clean. We discover that here and clean
				2853	* the page also.
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	2854	*
				2855	* private_lock must be held over this entire operation in order
				2856	* to synchronise against __set_page_dirty_buffers and prevent the
				2857	* dirty bit from being lost.
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	2858	*/
				2859	if (ret)
				2860	cancel_dirty_page(page, PAGE_CACHE_SIZE);
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	2861	spin_unlock(&mapping->private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2862	out:
				2863	if (buffers_to_free) {
				2864	struct buffer_head *bh = buffers_to_free;
				2865
				2866	do {
				2867	struct buffer_head *next = bh->b_this_page;
				2868	free_buffer_head(bh);
				2869	bh = next;
				2870	} while (bh != buffers_to_free);
				2871	}
				2872	return ret;
				2873	}
				2874	EXPORT_SYMBOL(try_to_free_buffers);
				2875
NeilBrown	3978d71	2006-03-26 01:37:17 -0800	[diff] [blame]	2876	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2877	{
				2878	struct address_space *mapping;
				2879
				2880	smp_mb();
				2881	mapping = page_mapping(page);
				2882	if (mapping)
				2883	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2884	}
				2885
				2886	/*
				2887	* There are no bdflush tunables left. But distributions are
				2888	* still running obsolete flush daemons, so we terminate them here.
				2889	*
				2890	* Use of bdflush() is deprecated and will be removed in a future kernel.
				2891	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				2892	*/
				2893	asmlinkage long sys_bdflush(int func, long data)
				2894	{
				2895	static int msg_count;
				2896
				2897	if (!capable(CAP_SYS_ADMIN))
				2898	return -EPERM;
				2899
				2900	if (msg_count < 5) {
				2901	msg_count++;
				2902	printk(KERN_INFO
				2903	"warning: process `%s' used the obsolete bdflush"
				2904	" system call\n", current->comm);
				2905	printk(KERN_INFO "Fix your initscripts?\n");
				2906	}
				2907
				2908	if (func == 1)
				2909	do_exit(0);
				2910	return 0;
				2911	}
				2912
				2913	/*
				2914	* Buffer-head allocation
				2915	*/
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	2916	static struct kmem_cache *bh_cachep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2917
				2918	/*
				2919	* Once the number of bh's in the machine exceeds this level, we start
				2920	* stripping them in writeback.
				2921	*/
				2922	static int max_buffer_heads;
				2923
				2924	int buffer_heads_over_limit;
				2925
				2926	struct bh_accounting {
				2927	int nr; /* Number of live bh's */
				2928	int ratelimit; /* Limit cacheline bouncing */
				2929	};
				2930
				2931	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				2932
				2933	static void recalc_bh_state(void)
				2934	{
				2935	int i;
				2936	int tot = 0;
				2937
				2938	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				2939	return;
				2940	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	2941	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2942	tot += per_cpu(bh_accounting, i).nr;
				2943	buffer_heads_over_limit = (tot > max_buffer_heads);
				2944	}
				2945
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	2946	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2947	{
				2948	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
				2949	if (ret) {
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2950	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2951	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2952	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2953	}
				2954	return ret;
				2955	}
				2956	EXPORT_SYMBOL(alloc_buffer_head);
				2957
				2958	void free_buffer_head(struct buffer_head *bh)
				2959	{
				2960	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				2961	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2962	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2963	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	2964	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2965	}
				2966	EXPORT_SYMBOL(free_buffer_head);
				2967
				2968	static void
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	2969	init_buffer_head(void data, struct kmem_cache cachep, unsigned long flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2970	{
				2971	if ((flags & (SLAB_CTOR_VERIFY\|SLAB_CTOR_CONSTRUCTOR)) ==
				2972	SLAB_CTOR_CONSTRUCTOR) {
				2973	struct buffer_head * bh = (struct buffer_head *)data;
				2974
				2975	memset(bh, 0, sizeof(*bh));
				2976	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				2977	}
				2978	}
				2979
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2980	static void buffer_exit_cpu(int cpu)
				2981	{
				2982	int i;
				2983	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				2984
				2985	for (i = 0; i < BH_LRU_SIZE; i++) {
				2986	brelse(b->bhs[i]);
				2987	b->bhs[i] = NULL;
				2988	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	2989	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				2990	per_cpu(bh_accounting, cpu).nr = 0;
				2991	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2992	}
				2993
				2994	static int buffer_cpu_notify(struct notifier_block *self,
				2995	unsigned long action, void *hcpu)
				2996	{
				2997	if (action == CPU_DEAD)
				2998	buffer_exit_cpu((unsigned long)hcpu);
				2999	return NOTIFY_OK;
				3000	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3001
				3002	void __init buffer_init(void)
				3003	{
				3004	int nrpages;
				3005
				3006	bh_cachep = kmem_cache_create("buffer_head",
Paul Jackson	b019600	2006-03-24 03:16:09 -0800	[diff] [blame]	3007	sizeof(struct buffer_head), 0,
				3008	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3009	SLAB_MEM_SPREAD),
				3010	init_buffer_head,
				3011	NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3012
				3013	/*
				3014	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3015	*/
				3016	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3017	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3018	hotcpu_notifier(buffer_cpu_notify, 0);
				3019	}
				3020
				3021	EXPORT_SYMBOL(__bforget);
				3022	EXPORT_SYMBOL(__brelse);
				3023	EXPORT_SYMBOL(__wait_on_buffer);
				3024	EXPORT_SYMBOL(block_commit_write);
				3025	EXPORT_SYMBOL(block_prepare_write);
				3026	EXPORT_SYMBOL(block_read_full_page);
				3027	EXPORT_SYMBOL(block_sync_page);
				3028	EXPORT_SYMBOL(block_truncate_page);
				3029	EXPORT_SYMBOL(block_write_full_page);
				3030	EXPORT_SYMBOL(cont_prepare_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3031	EXPORT_SYMBOL(end_buffer_read_sync);
				3032	EXPORT_SYMBOL(end_buffer_write_sync);
				3033	EXPORT_SYMBOL(file_fsync);
				3034	EXPORT_SYMBOL(fsync_bdev);
				3035	EXPORT_SYMBOL(generic_block_bmap);
				3036	EXPORT_SYMBOL(generic_commit_write);
				3037	EXPORT_SYMBOL(generic_cont_expand);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	3038	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3039	EXPORT_SYMBOL(init_buffer);
				3040	EXPORT_SYMBOL(invalidate_bdev);
				3041	EXPORT_SYMBOL(ll_rw_block);
				3042	EXPORT_SYMBOL(mark_buffer_dirty);
				3043	EXPORT_SYMBOL(submit_bh);
				3044	EXPORT_SYMBOL(sync_dirty_buffer);
				3045	EXPORT_SYMBOL(unlock_buffer);