blob: 6d77ce9f54e52031ef23d50643bcce772ed20c65 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 *
16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
21#include <linux/config.h>
22#include <linux/kernel.h>
23#include <linux/syscalls.h>
24#include <linux/fs.h>
25#include <linux/mm.h>
26#include <linux/percpu.h>
27#include <linux/slab.h>
28#include <linux/smp_lock.h>
Randy Dunlap16f7e0f2006-01-11 12:17:46 -080029#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070030#include <linux/blkdev.h>
31#include <linux/file.h>
32#include <linux/quotaops.h>
33#include <linux/highmem.h>
34#include <linux/module.h>
35#include <linux/writeback.h>
36#include <linux/hash.h>
37#include <linux/suspend.h>
38#include <linux/buffer_head.h>
39#include <linux/bio.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/bitops.h>
43#include <linux/mpage.h>
Ingo Molnarfb1c8f92005-09-10 00:25:56 -070044#include <linux/bit_spinlock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070045
46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47static void invalidate_bh_lrus(void);
48
49#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
50
51inline void
52init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
53{
54 bh->b_end_io = handler;
55 bh->b_private = private;
56}
57
58static int sync_buffer(void *word)
59{
60 struct block_device *bd;
61 struct buffer_head *bh
62 = container_of(word, struct buffer_head, b_state);
63
64 smp_mb();
65 bd = bh->b_bdev;
66 if (bd)
67 blk_run_address_space(bd->bd_inode->i_mapping);
68 io_schedule();
69 return 0;
70}
71
72void fastcall __lock_buffer(struct buffer_head *bh)
73{
74 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
75 TASK_UNINTERRUPTIBLE);
76}
77EXPORT_SYMBOL(__lock_buffer);
78
79void fastcall unlock_buffer(struct buffer_head *bh)
80{
81 clear_buffer_locked(bh);
82 smp_mb__after_clear_bit();
83 wake_up_bit(&bh->b_state, BH_Lock);
84}
85
86/*
87 * Block until a buffer comes unlocked. This doesn't stop it
88 * from becoming locked again - you have to lock it yourself
89 * if you want to preserve its state.
90 */
91void __wait_on_buffer(struct buffer_head * bh)
92{
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
94}
95
96static void
97__clear_page_buffers(struct page *page)
98{
99 ClearPagePrivate(page);
Hugh Dickins4c21e2f2005-10-29 18:16:40 -0700100 set_page_private(page, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700101 page_cache_release(page);
102}
103
104static void buffer_io_error(struct buffer_head *bh)
105{
106 char b[BDEVNAME_SIZE];
107
108 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
109 bdevname(bh->b_bdev, b),
110 (unsigned long long)bh->b_blocknr);
111}
112
113/*
114 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
115 * unlock the buffer. This is what ll_rw_block uses too.
116 */
117void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
118{
119 if (uptodate) {
120 set_buffer_uptodate(bh);
121 } else {
122 /* This happens, due to failed READA attempts. */
123 clear_buffer_uptodate(bh);
124 }
125 unlock_buffer(bh);
126 put_bh(bh);
127}
128
129void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
130{
131 char b[BDEVNAME_SIZE];
132
133 if (uptodate) {
134 set_buffer_uptodate(bh);
135 } else {
136 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
137 buffer_io_error(bh);
138 printk(KERN_WARNING "lost page write due to "
139 "I/O error on %s\n",
140 bdevname(bh->b_bdev, b));
141 }
142 set_buffer_write_io_error(bh);
143 clear_buffer_uptodate(bh);
144 }
145 unlock_buffer(bh);
146 put_bh(bh);
147}
148
149/*
150 * Write out and wait upon all the dirty data associated with a block
151 * device via its mapping. Does not take the superblock lock.
152 */
153int sync_blockdev(struct block_device *bdev)
154{
155 int ret = 0;
156
OGAWA Hirofumi28fd1292006-01-08 01:02:14 -0800157 if (bdev)
158 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159 return ret;
160}
161EXPORT_SYMBOL(sync_blockdev);
162
163/*
164 * Write out and wait upon all dirty data associated with this
165 * superblock. Filesystem data as well as the underlying block
166 * device. Takes the superblock lock.
167 */
168int fsync_super(struct super_block *sb)
169{
170 sync_inodes_sb(sb, 0);
171 DQUOT_SYNC(sb);
172 lock_super(sb);
173 if (sb->s_dirt && sb->s_op->write_super)
174 sb->s_op->write_super(sb);
175 unlock_super(sb);
176 if (sb->s_op->sync_fs)
177 sb->s_op->sync_fs(sb, 1);
178 sync_blockdev(sb->s_bdev);
179 sync_inodes_sb(sb, 1);
180
181 return sync_blockdev(sb->s_bdev);
182}
183
184/*
185 * Write out and wait upon all dirty data associated with this
186 * device. Filesystem data as well as the underlying block
187 * device. Takes the superblock lock.
188 */
189int fsync_bdev(struct block_device *bdev)
190{
191 struct super_block *sb = get_super(bdev);
192 if (sb) {
193 int res = fsync_super(sb);
194 drop_super(sb);
195 return res;
196 }
197 return sync_blockdev(bdev);
198}
199
200/**
201 * freeze_bdev -- lock a filesystem and force it into a consistent state
202 * @bdev: blockdevice to lock
203 *
Arjan van de Venc039e312006-03-23 03:00:28 -0800204 * This takes the block device bd_mount_mutex to make sure no new mounts
Linus Torvalds1da177e2005-04-16 15:20:36 -0700205 * happen on bdev until thaw_bdev() is called.
206 * If a superblock is found on this device, we take the s_umount semaphore
207 * on it to make sure nobody unmounts until the snapshot creation is done.
208 */
209struct super_block *freeze_bdev(struct block_device *bdev)
210{
211 struct super_block *sb;
212
Arjan van de Venc039e312006-03-23 03:00:28 -0800213 mutex_lock(&bdev->bd_mount_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700214 sb = get_super(bdev);
215 if (sb && !(sb->s_flags & MS_RDONLY)) {
216 sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.orgd59dd462005-05-01 08:58:47 -0700217 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700218
219 sync_inodes_sb(sb, 0);
220 DQUOT_SYNC(sb);
221
222 lock_super(sb);
223 if (sb->s_dirt && sb->s_op->write_super)
224 sb->s_op->write_super(sb);
225 unlock_super(sb);
226
227 if (sb->s_op->sync_fs)
228 sb->s_op->sync_fs(sb, 1);
229
230 sync_blockdev(sb->s_bdev);
231 sync_inodes_sb(sb, 1);
232
233 sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.orgd59dd462005-05-01 08:58:47 -0700234 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700235
236 sync_blockdev(sb->s_bdev);
237
238 if (sb->s_op->write_super_lockfs)
239 sb->s_op->write_super_lockfs(sb);
240 }
241
242 sync_blockdev(bdev);
243 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
244}
245EXPORT_SYMBOL(freeze_bdev);
246
247/**
248 * thaw_bdev -- unlock filesystem
249 * @bdev: blockdevice to unlock
250 * @sb: associated superblock
251 *
252 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
253 */
254void thaw_bdev(struct block_device *bdev, struct super_block *sb)
255{
256 if (sb) {
257 BUG_ON(sb->s_bdev != bdev);
258
259 if (sb->s_op->unlockfs)
260 sb->s_op->unlockfs(sb);
261 sb->s_frozen = SB_UNFROZEN;
akpm@osdl.orgd59dd462005-05-01 08:58:47 -0700262 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263 wake_up(&sb->s_wait_unfrozen);
264 drop_super(sb);
265 }
266
Arjan van de Venc039e312006-03-23 03:00:28 -0800267 mutex_unlock(&bdev->bd_mount_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268}
269EXPORT_SYMBOL(thaw_bdev);
270
271/*
272 * sync everything. Start out by waking pdflush, because that writes back
273 * all queues in parallel.
274 */
275static void do_sync(unsigned long wait)
276{
Pekka J Enberg687a21c2005-06-28 20:44:55 -0700277 wakeup_pdflush(0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278 sync_inodes(0); /* All mappings, inodes and their blockdevs */
279 DQUOT_SYNC(NULL);
280 sync_supers(); /* Write the superblocks */
281 sync_filesystems(0); /* Start syncing the filesystems */
282 sync_filesystems(wait); /* Waitingly sync the filesystems */
283 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
284 if (!wait)
285 printk("Emergency Sync complete\n");
286 if (unlikely(laptop_mode))
287 laptop_sync_completion();
288}
289
290asmlinkage long sys_sync(void)
291{
292 do_sync(1);
293 return 0;
294}
295
296void emergency_sync(void)
297{
298 pdflush_operation(do_sync, 0);
299}
300
301/*
302 * Generic function to fsync a file.
303 *
304 * filp may be NULL if called via the msync of a vma.
305 */
306
307int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
308{
309 struct inode * inode = dentry->d_inode;
310 struct super_block * sb;
311 int ret, err;
312
313 /* sync the inode to buffers */
314 ret = write_inode_now(inode, 0);
315
316 /* sync the superblock to buffers */
317 sb = inode->i_sb;
318 lock_super(sb);
319 if (sb->s_op->write_super)
320 sb->s_op->write_super(sb);
321 unlock_super(sb);
322
323 /* .. finally sync the buffers to disk */
324 err = sync_blockdev(sb->s_bdev);
325 if (!ret)
326 ret = err;
327 return ret;
328}
329
Andrew Morton18e79b42006-03-24 03:18:14 -0800330long do_fsync(struct file *file, int datasync)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700331{
Andrew Morton18e79b42006-03-24 03:18:14 -0800332 int ret;
333 int err;
334 struct address_space *mapping = file->f_mapping;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336 if (!file->f_op || !file->f_op->fsync) {
337 /* Why? We can still call filemap_fdatawrite */
Andrew Morton18e79b42006-03-24 03:18:14 -0800338 ret = -EINVAL;
339 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340 }
341
342 current->flags |= PF_SYNCWRITE;
343 ret = filemap_fdatawrite(mapping);
344
345 /*
Andrew Morton18e79b42006-03-24 03:18:14 -0800346 * We need to protect against concurrent writers, which could cause
347 * livelocks in fsync_buffers_list().
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 */
Jes Sorensen1b1dcc12006-01-09 15:59:24 -0800349 mutex_lock(&mapping->host->i_mutex);
Oleg Nesterovdfb388b2005-06-23 00:10:02 -0700350 err = file->f_op->fsync(file, file->f_dentry, datasync);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351 if (!ret)
352 ret = err;
Jes Sorensen1b1dcc12006-01-09 15:59:24 -0800353 mutex_unlock(&mapping->host->i_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354 err = filemap_fdatawait(mapping);
355 if (!ret)
356 ret = err;
357 current->flags &= ~PF_SYNCWRITE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358out:
359 return ret;
360}
361
Andrew Morton18e79b42006-03-24 03:18:14 -0800362static long __do_fsync(unsigned int fd, int datasync)
363{
364 struct file *file;
365 int ret = -EBADF;
366
367 file = fget(fd);
368 if (file) {
369 ret = do_fsync(file, datasync);
370 fput(file);
371 }
372 return ret;
373}
374
Oleg Nesterovdfb388b2005-06-23 00:10:02 -0700375asmlinkage long sys_fsync(unsigned int fd)
376{
Andrew Morton18e79b42006-03-24 03:18:14 -0800377 return __do_fsync(fd, 0);
Oleg Nesterovdfb388b2005-06-23 00:10:02 -0700378}
379
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380asmlinkage long sys_fdatasync(unsigned int fd)
381{
Andrew Morton18e79b42006-03-24 03:18:14 -0800382 return __do_fsync(fd, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383}
384
385/*
386 * Various filesystems appear to want __find_get_block to be non-blocking.
387 * But it's the page lock which protects the buffers. To get around this,
388 * we get exclusion from try_to_free_buffers with the blockdev mapping's
389 * private_lock.
390 *
391 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
392 * may be quite high. This code could TryLock the page, and if that
393 * succeeds, there is no need to take private_lock. (But if
394 * private_lock is contended then so is mapping->tree_lock).
395 */
396static struct buffer_head *
Coywolf Qi Hunt385fd4c2005-11-07 00:59:39 -0800397__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700398{
399 struct inode *bd_inode = bdev->bd_inode;
400 struct address_space *bd_mapping = bd_inode->i_mapping;
401 struct buffer_head *ret = NULL;
402 pgoff_t index;
403 struct buffer_head *bh;
404 struct buffer_head *head;
405 struct page *page;
406 int all_mapped = 1;
407
408 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
409 page = find_get_page(bd_mapping, index);
410 if (!page)
411 goto out;
412
413 spin_lock(&bd_mapping->private_lock);
414 if (!page_has_buffers(page))
415 goto out_unlock;
416 head = page_buffers(page);
417 bh = head;
418 do {
419 if (bh->b_blocknr == block) {
420 ret = bh;
421 get_bh(bh);
422 goto out_unlock;
423 }
424 if (!buffer_mapped(bh))
425 all_mapped = 0;
426 bh = bh->b_this_page;
427 } while (bh != head);
428
429 /* we might be here because some of the buffers on this page are
430 * not mapped. This is due to various races between
431 * file io on the block device and getblk. It gets dealt with
432 * elsewhere, don't buffer_error if we had some unmapped buffers
433 */
434 if (all_mapped) {
435 printk("__find_get_block_slow() failed. "
436 "block=%llu, b_blocknr=%llu\n",
437 (unsigned long long)block, (unsigned long long)bh->b_blocknr);
438 printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
439 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
440 }
441out_unlock:
442 spin_unlock(&bd_mapping->private_lock);
443 page_cache_release(page);
444out:
445 return ret;
446}
447
448/* If invalidate_buffers() will trash dirty buffers, it means some kind
449 of fs corruption is going on. Trashing dirty data always imply losing
450 information that was supposed to be just stored on the physical layer
451 by the user.
452
453 Thus invalidate_buffers in general usage is not allwowed to trash
454 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
455 be preserved. These buffers are simply skipped.
456
457 We also skip buffers which are still in use. For example this can
458 happen if a userspace program is reading the block device.
459
460 NOTE: In the case where the user removed a removable-media-disk even if
461 there's still dirty data not synced on disk (due a bug in the device driver
462 or due an error of the user), by not destroying the dirty buffers we could
463 generate corruption also on the next media inserted, thus a parameter is
464 necessary to handle this case in the most safe way possible (trying
465 to not corrupt also the new disk inserted with the data belonging to
466 the old now corrupted disk). Also for the ramdisk the natural thing
467 to do in order to release the ramdisk memory is to destroy dirty buffers.
468
469 These are two special cases. Normal usage imply the device driver
470 to issue a sync on the device (without waiting I/O completion) and
471 then an invalidate_buffers call that doesn't trash dirty buffers.
472
473 For handling cache coherency with the blkdev pagecache the 'update' case
474 is been introduced. It is needed to re-read from disk any pinned
475 buffer. NOTE: re-reading from disk is destructive so we can do it only
476 when we assume nobody is changing the buffercache under our I/O and when
477 we think the disk contains more recent information than the buffercache.
478 The update == 1 pass marks the buffers we need to update, the update == 2
479 pass does the actual I/O. */
480void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
481{
482 invalidate_bh_lrus();
483 /*
484 * FIXME: what about destroy_dirty_buffers?
485 * We really want to use invalidate_inode_pages2() for
486 * that, but not until that's cleaned up.
487 */
488 invalidate_inode_pages(bdev->bd_inode->i_mapping);
489}
490
491/*
492 * Kick pdflush then try to free up some ZONE_NORMAL memory.
493 */
494static void free_more_memory(void)
495{
496 struct zone **zones;
497 pg_data_t *pgdat;
498
Pekka J Enberg687a21c2005-06-28 20:44:55 -0700499 wakeup_pdflush(1024);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700500 yield();
501
502 for_each_pgdat(pgdat) {
Al Viroaf4ca452005-10-21 02:55:38 -0400503 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700504 if (*zones)
Darren Hart1ad539b2005-06-21 17:14:53 -0700505 try_to_free_pages(zones, GFP_NOFS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700506 }
507}
508
509/*
510 * I/O completion handler for block_read_full_page() - pages
511 * which come unlocked at the end of I/O.
512 */
513static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
514{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700515 unsigned long flags;
Nick Piggina3972202005-07-07 17:56:56 -0700516 struct buffer_head *first;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517 struct buffer_head *tmp;
518 struct page *page;
519 int page_uptodate = 1;
520
521 BUG_ON(!buffer_async_read(bh));
522
523 page = bh->b_page;
524 if (uptodate) {
525 set_buffer_uptodate(bh);
526 } else {
527 clear_buffer_uptodate(bh);
528 if (printk_ratelimit())
529 buffer_io_error(bh);
530 SetPageError(page);
531 }
532
533 /*
534 * Be _very_ careful from here on. Bad things can happen if
535 * two buffer heads end IO at almost the same time and both
536 * decide that the page is now completely done.
537 */
Nick Piggina3972202005-07-07 17:56:56 -0700538 first = page_buffers(page);
539 local_irq_save(flags);
540 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700541 clear_buffer_async_read(bh);
542 unlock_buffer(bh);
543 tmp = bh;
544 do {
545 if (!buffer_uptodate(tmp))
546 page_uptodate = 0;
547 if (buffer_async_read(tmp)) {
548 BUG_ON(!buffer_locked(tmp));
549 goto still_busy;
550 }
551 tmp = tmp->b_this_page;
552 } while (tmp != bh);
Nick Piggina3972202005-07-07 17:56:56 -0700553 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
554 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700555
556 /*
557 * If none of the buffers had errors and they are all
558 * uptodate then we can set the page uptodate.
559 */
560 if (page_uptodate && !PageError(page))
561 SetPageUptodate(page);
562 unlock_page(page);
563 return;
564
565still_busy:
Nick Piggina3972202005-07-07 17:56:56 -0700566 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
567 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700568 return;
569}
570
571/*
572 * Completion handler for block_write_full_page() - pages which are unlocked
573 * during I/O, and which have PageWriteback cleared upon I/O completion.
574 */
575void end_buffer_async_write(struct buffer_head *bh, int uptodate)
576{
577 char b[BDEVNAME_SIZE];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700578 unsigned long flags;
Nick Piggina3972202005-07-07 17:56:56 -0700579 struct buffer_head *first;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700580 struct buffer_head *tmp;
581 struct page *page;
582
583 BUG_ON(!buffer_async_write(bh));
584
585 page = bh->b_page;
586 if (uptodate) {
587 set_buffer_uptodate(bh);
588 } else {
589 if (printk_ratelimit()) {
590 buffer_io_error(bh);
591 printk(KERN_WARNING "lost page write due to "
592 "I/O error on %s\n",
593 bdevname(bh->b_bdev, b));
594 }
595 set_bit(AS_EIO, &page->mapping->flags);
596 clear_buffer_uptodate(bh);
597 SetPageError(page);
598 }
599
Nick Piggina3972202005-07-07 17:56:56 -0700600 first = page_buffers(page);
601 local_irq_save(flags);
602 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
603
Linus Torvalds1da177e2005-04-16 15:20:36 -0700604 clear_buffer_async_write(bh);
605 unlock_buffer(bh);
606 tmp = bh->b_this_page;
607 while (tmp != bh) {
608 if (buffer_async_write(tmp)) {
609 BUG_ON(!buffer_locked(tmp));
610 goto still_busy;
611 }
612 tmp = tmp->b_this_page;
613 }
Nick Piggina3972202005-07-07 17:56:56 -0700614 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
615 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700616 end_page_writeback(page);
617 return;
618
619still_busy:
Nick Piggina3972202005-07-07 17:56:56 -0700620 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
621 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700622 return;
623}
624
625/*
626 * If a page's buffers are under async readin (end_buffer_async_read
627 * completion) then there is a possibility that another thread of
628 * control could lock one of the buffers after it has completed
629 * but while some of the other buffers have not completed. This
630 * locked buffer would confuse end_buffer_async_read() into not unlocking
631 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
632 * that this buffer is not under async I/O.
633 *
634 * The page comes unlocked when it has no locked buffer_async buffers
635 * left.
636 *
637 * PageLocked prevents anyone starting new async I/O reads any of
638 * the buffers.
639 *
640 * PageWriteback is used to prevent simultaneous writeout of the same
641 * page.
642 *
643 * PageLocked prevents anyone from starting writeback of a page which is
644 * under read I/O (PageWriteback is only ever set against a locked page).
645 */
646static void mark_buffer_async_read(struct buffer_head *bh)
647{
648 bh->b_end_io = end_buffer_async_read;
649 set_buffer_async_read(bh);
650}
651
652void mark_buffer_async_write(struct buffer_head *bh)
653{
654 bh->b_end_io = end_buffer_async_write;
655 set_buffer_async_write(bh);
656}
657EXPORT_SYMBOL(mark_buffer_async_write);
658
659
660/*
661 * fs/buffer.c contains helper functions for buffer-backed address space's
662 * fsync functions. A common requirement for buffer-based filesystems is
663 * that certain data from the backing blockdev needs to be written out for
664 * a successful fsync(). For example, ext2 indirect blocks need to be
665 * written back and waited upon before fsync() returns.
666 *
667 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
668 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
669 * management of a list of dependent buffers at ->i_mapping->private_list.
670 *
671 * Locking is a little subtle: try_to_free_buffers() will remove buffers
672 * from their controlling inode's queue when they are being freed. But
673 * try_to_free_buffers() will be operating against the *blockdev* mapping
674 * at the time, not against the S_ISREG file which depends on those buffers.
675 * So the locking for private_list is via the private_lock in the address_space
676 * which backs the buffers. Which is different from the address_space
677 * against which the buffers are listed. So for a particular address_space,
678 * mapping->private_lock does *not* protect mapping->private_list! In fact,
679 * mapping->private_list will always be protected by the backing blockdev's
680 * ->private_lock.
681 *
682 * Which introduces a requirement: all buffers on an address_space's
683 * ->private_list must be from the same address_space: the blockdev's.
684 *
685 * address_spaces which do not place buffers at ->private_list via these
686 * utility functions are free to use private_lock and private_list for
687 * whatever they want. The only requirement is that list_empty(private_list)
688 * be true at clear_inode() time.
689 *
690 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
691 * filesystems should do that. invalidate_inode_buffers() should just go
692 * BUG_ON(!list_empty).
693 *
694 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
695 * take an address_space, not an inode. And it should be called
696 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
697 * queued up.
698 *
699 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
700 * list if it is already on a list. Because if the buffer is on a list,
701 * it *must* already be on the right one. If not, the filesystem is being
702 * silly. This will save a ton of locking. But first we have to ensure
703 * that buffers are taken *off* the old inode's list when they are freed
704 * (presumably in truncate). That requires careful auditing of all
705 * filesystems (do it inside bforget()). It could also be done by bringing
706 * b_inode back.
707 */
708
709/*
710 * The buffer's backing address_space's private_lock must be held
711 */
712static inline void __remove_assoc_queue(struct buffer_head *bh)
713{
714 list_del_init(&bh->b_assoc_buffers);
715}
716
717int inode_has_buffers(struct inode *inode)
718{
719 return !list_empty(&inode->i_data.private_list);
720}
721
722/*
723 * osync is designed to support O_SYNC io. It waits synchronously for
724 * all already-submitted IO to complete, but does not queue any new
725 * writes to the disk.
726 *
727 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
728 * you dirty the buffers, and then use osync_inode_buffers to wait for
729 * completion. Any other dirty buffers which are not yet queued for
730 * write will not be flushed to disk by the osync.
731 */
732static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
733{
734 struct buffer_head *bh;
735 struct list_head *p;
736 int err = 0;
737
738 spin_lock(lock);
739repeat:
740 list_for_each_prev(p, list) {
741 bh = BH_ENTRY(p);
742 if (buffer_locked(bh)) {
743 get_bh(bh);
744 spin_unlock(lock);
745 wait_on_buffer(bh);
746 if (!buffer_uptodate(bh))
747 err = -EIO;
748 brelse(bh);
749 spin_lock(lock);
750 goto repeat;
751 }
752 }
753 spin_unlock(lock);
754 return err;
755}
756
757/**
758 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
759 * buffers
Martin Waitz67be2dd2005-05-01 08:59:26 -0700760 * @mapping: the mapping which wants those buffers written
Linus Torvalds1da177e2005-04-16 15:20:36 -0700761 *
762 * Starts I/O against the buffers at mapping->private_list, and waits upon
763 * that I/O.
764 *
Martin Waitz67be2dd2005-05-01 08:59:26 -0700765 * Basically, this is a convenience function for fsync().
766 * @mapping is a file or directory which needs those buffers to be written for
767 * a successful fsync().
Linus Torvalds1da177e2005-04-16 15:20:36 -0700768 */
769int sync_mapping_buffers(struct address_space *mapping)
770{
771 struct address_space *buffer_mapping = mapping->assoc_mapping;
772
773 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
774 return 0;
775
776 return fsync_buffers_list(&buffer_mapping->private_lock,
777 &mapping->private_list);
778}
779EXPORT_SYMBOL(sync_mapping_buffers);
780
781/*
782 * Called when we've recently written block `bblock', and it is known that
783 * `bblock' was for a buffer_boundary() buffer. This means that the block at
784 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
785 * dirty, schedule it for IO. So that indirects merge nicely with their data.
786 */
787void write_boundary_block(struct block_device *bdev,
788 sector_t bblock, unsigned blocksize)
789{
790 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
791 if (bh) {
792 if (buffer_dirty(bh))
793 ll_rw_block(WRITE, 1, &bh);
794 put_bh(bh);
795 }
796}
797
798void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
799{
800 struct address_space *mapping = inode->i_mapping;
801 struct address_space *buffer_mapping = bh->b_page->mapping;
802
803 mark_buffer_dirty(bh);
804 if (!mapping->assoc_mapping) {
805 mapping->assoc_mapping = buffer_mapping;
806 } else {
807 if (mapping->assoc_mapping != buffer_mapping)
808 BUG();
809 }
810 if (list_empty(&bh->b_assoc_buffers)) {
811 spin_lock(&buffer_mapping->private_lock);
812 list_move_tail(&bh->b_assoc_buffers,
813 &mapping->private_list);
814 spin_unlock(&buffer_mapping->private_lock);
815 }
816}
817EXPORT_SYMBOL(mark_buffer_dirty_inode);
818
819/*
820 * Add a page to the dirty page list.
821 *
822 * It is a sad fact of life that this function is called from several places
823 * deeply under spinlocking. It may not sleep.
824 *
825 * If the page has buffers, the uptodate buffers are set dirty, to preserve
826 * dirty-state coherency between the page and the buffers. It the page does
827 * not have buffers then when they are later attached they will all be set
828 * dirty.
829 *
830 * The buffers are dirtied before the page is dirtied. There's a small race
831 * window in which a writepage caller may see the page cleanness but not the
832 * buffer dirtiness. That's fine. If this code were to set the page dirty
833 * before the buffers, a concurrent writepage caller could clear the page dirty
834 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
835 * page on the dirty page list.
836 *
837 * We use private_lock to lock against try_to_free_buffers while using the
838 * page's buffer list. Also use this to protect against clean buffers being
839 * added to the page after it was set dirty.
840 *
841 * FIXME: may need to call ->reservepage here as well. That's rather up to the
842 * address_space though.
843 */
844int __set_page_dirty_buffers(struct page *page)
845{
846 struct address_space * const mapping = page->mapping;
847
848 spin_lock(&mapping->private_lock);
849 if (page_has_buffers(page)) {
850 struct buffer_head *head = page_buffers(page);
851 struct buffer_head *bh = head;
852
853 do {
854 set_buffer_dirty(bh);
855 bh = bh->b_this_page;
856 } while (bh != head);
857 }
858 spin_unlock(&mapping->private_lock);
859
860 if (!TestSetPageDirty(page)) {
861 write_lock_irq(&mapping->tree_lock);
862 if (page->mapping) { /* Race with truncate? */
863 if (mapping_cap_account_dirty(mapping))
864 inc_page_state(nr_dirty);
865 radix_tree_tag_set(&mapping->page_tree,
866 page_index(page),
867 PAGECACHE_TAG_DIRTY);
868 }
869 write_unlock_irq(&mapping->tree_lock);
870 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Andrew Morton4741c9f2006-03-24 03:18:11 -0800871 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700872 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700873 return 0;
874}
875EXPORT_SYMBOL(__set_page_dirty_buffers);
876
877/*
878 * Write out and wait upon a list of buffers.
879 *
880 * We have conflicting pressures: we want to make sure that all
881 * initially dirty buffers get waited on, but that any subsequently
882 * dirtied buffers don't. After all, we don't want fsync to last
883 * forever if somebody is actively writing to the file.
884 *
885 * Do this in two main stages: first we copy dirty buffers to a
886 * temporary inode list, queueing the writes as we go. Then we clean
887 * up, waiting for those writes to complete.
888 *
889 * During this second stage, any subsequent updates to the file may end
890 * up refiling the buffer on the original inode's dirty list again, so
891 * there is a chance we will end up with a buffer queued for write but
892 * not yet completed on that list. So, as a final cleanup we go through
893 * the osync code to catch these locked, dirty buffers without requeuing
894 * any newly dirty buffers for write.
895 */
896static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
897{
898 struct buffer_head *bh;
899 struct list_head tmp;
900 int err = 0, err2;
901
902 INIT_LIST_HEAD(&tmp);
903
904 spin_lock(lock);
905 while (!list_empty(list)) {
906 bh = BH_ENTRY(list->next);
907 list_del_init(&bh->b_assoc_buffers);
908 if (buffer_dirty(bh) || buffer_locked(bh)) {
909 list_add(&bh->b_assoc_buffers, &tmp);
910 if (buffer_dirty(bh)) {
911 get_bh(bh);
912 spin_unlock(lock);
913 /*
914 * Ensure any pending I/O completes so that
915 * ll_rw_block() actually writes the current
916 * contents - it is a noop if I/O is still in
917 * flight on potentially older contents.
918 */
Jan Karaa7662232005-09-06 15:19:10 -0700919 ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700920 brelse(bh);
921 spin_lock(lock);
922 }
923 }
924 }
925
926 while (!list_empty(&tmp)) {
927 bh = BH_ENTRY(tmp.prev);
928 __remove_assoc_queue(bh);
929 get_bh(bh);
930 spin_unlock(lock);
931 wait_on_buffer(bh);
932 if (!buffer_uptodate(bh))
933 err = -EIO;
934 brelse(bh);
935 spin_lock(lock);
936 }
937
938 spin_unlock(lock);
939 err2 = osync_buffers_list(lock, list);
940 if (err)
941 return err;
942 else
943 return err2;
944}
945
946/*
947 * Invalidate any and all dirty buffers on a given inode. We are
948 * probably unmounting the fs, but that doesn't mean we have already
949 * done a sync(). Just drop the buffers from the inode list.
950 *
951 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
952 * assumes that all the buffers are against the blockdev. Not true
953 * for reiserfs.
954 */
955void invalidate_inode_buffers(struct inode *inode)
956{
957 if (inode_has_buffers(inode)) {
958 struct address_space *mapping = &inode->i_data;
959 struct list_head *list = &mapping->private_list;
960 struct address_space *buffer_mapping = mapping->assoc_mapping;
961
962 spin_lock(&buffer_mapping->private_lock);
963 while (!list_empty(list))
964 __remove_assoc_queue(BH_ENTRY(list->next));
965 spin_unlock(&buffer_mapping->private_lock);
966 }
967}
968
969/*
970 * Remove any clean buffers from the inode's buffer list. This is called
971 * when we're trying to free the inode itself. Those buffers can pin it.
972 *
973 * Returns true if all buffers were removed.
974 */
975int remove_inode_buffers(struct inode *inode)
976{
977 int ret = 1;
978
979 if (inode_has_buffers(inode)) {
980 struct address_space *mapping = &inode->i_data;
981 struct list_head *list = &mapping->private_list;
982 struct address_space *buffer_mapping = mapping->assoc_mapping;
983
984 spin_lock(&buffer_mapping->private_lock);
985 while (!list_empty(list)) {
986 struct buffer_head *bh = BH_ENTRY(list->next);
987 if (buffer_dirty(bh)) {
988 ret = 0;
989 break;
990 }
991 __remove_assoc_queue(bh);
992 }
993 spin_unlock(&buffer_mapping->private_lock);
994 }
995 return ret;
996}
997
998/*
999 * Create the appropriate buffers when given a page for data area and
1000 * the size of each buffer.. Use the bh->b_this_page linked list to
1001 * follow the buffers created. Return NULL if unable to create more
1002 * buffers.
1003 *
1004 * The retry flag is used to differentiate async IO (paging, swapping)
1005 * which may not fail from ordinary buffer allocations.
1006 */
1007struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
1008 int retry)
1009{
1010 struct buffer_head *bh, *head;
1011 long offset;
1012
1013try_again:
1014 head = NULL;
1015 offset = PAGE_SIZE;
1016 while ((offset -= size) >= 0) {
1017 bh = alloc_buffer_head(GFP_NOFS);
1018 if (!bh)
1019 goto no_grow;
1020
1021 bh->b_bdev = NULL;
1022 bh->b_this_page = head;
1023 bh->b_blocknr = -1;
1024 head = bh;
1025
1026 bh->b_state = 0;
1027 atomic_set(&bh->b_count, 0);
Chris Masonfc5cd582006-02-01 03:06:48 -08001028 bh->b_private = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001029 bh->b_size = size;
1030
1031 /* Link the buffer to its page */
1032 set_bh_page(bh, page, offset);
1033
Nathan Scott01ffe332006-01-17 09:02:07 +11001034 init_buffer(bh, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001035 }
1036 return head;
1037/*
1038 * In case anything failed, we just free everything we got.
1039 */
1040no_grow:
1041 if (head) {
1042 do {
1043 bh = head;
1044 head = head->b_this_page;
1045 free_buffer_head(bh);
1046 } while (head);
1047 }
1048
1049 /*
1050 * Return failure for non-async IO requests. Async IO requests
1051 * are not allowed to fail, so we have to wait until buffer heads
1052 * become available. But we don't want tasks sleeping with
1053 * partially complete buffers, so all were released above.
1054 */
1055 if (!retry)
1056 return NULL;
1057
1058 /* We're _really_ low on memory. Now we just
1059 * wait for old buffer heads to become free due to
1060 * finishing IO. Since this is an async request and
1061 * the reserve list is empty, we're sure there are
1062 * async buffer heads in use.
1063 */
1064 free_more_memory();
1065 goto try_again;
1066}
1067EXPORT_SYMBOL_GPL(alloc_page_buffers);
1068
1069static inline void
1070link_dev_buffers(struct page *page, struct buffer_head *head)
1071{
1072 struct buffer_head *bh, *tail;
1073
1074 bh = head;
1075 do {
1076 tail = bh;
1077 bh = bh->b_this_page;
1078 } while (bh);
1079 tail->b_this_page = head;
1080 attach_page_buffers(page, head);
1081}
1082
1083/*
1084 * Initialise the state of a blockdev page's buffers.
1085 */
1086static void
1087init_page_buffers(struct page *page, struct block_device *bdev,
1088 sector_t block, int size)
1089{
1090 struct buffer_head *head = page_buffers(page);
1091 struct buffer_head *bh = head;
1092 int uptodate = PageUptodate(page);
1093
1094 do {
1095 if (!buffer_mapped(bh)) {
1096 init_buffer(bh, NULL, NULL);
1097 bh->b_bdev = bdev;
1098 bh->b_blocknr = block;
1099 if (uptodate)
1100 set_buffer_uptodate(bh);
1101 set_buffer_mapped(bh);
1102 }
1103 block++;
1104 bh = bh->b_this_page;
1105 } while (bh != head);
1106}
1107
1108/*
1109 * Create the page-cache page that contains the requested block.
1110 *
1111 * This is user purely for blockdev mappings.
1112 */
1113static struct page *
1114grow_dev_page(struct block_device *bdev, sector_t block,
1115 pgoff_t index, int size)
1116{
1117 struct inode *inode = bdev->bd_inode;
1118 struct page *page;
1119 struct buffer_head *bh;
1120
1121 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1122 if (!page)
1123 return NULL;
1124
1125 if (!PageLocked(page))
1126 BUG();
1127
1128 if (page_has_buffers(page)) {
1129 bh = page_buffers(page);
1130 if (bh->b_size == size) {
1131 init_page_buffers(page, bdev, block, size);
1132 return page;
1133 }
1134 if (!try_to_free_buffers(page))
1135 goto failed;
1136 }
1137
1138 /*
1139 * Allocate some buffers for this page
1140 */
1141 bh = alloc_page_buffers(page, size, 0);
1142 if (!bh)
1143 goto failed;
1144
1145 /*
1146 * Link the page to the buffers and initialise them. Take the
1147 * lock to be atomic wrt __find_get_block(), which does not
1148 * run under the page lock.
1149 */
1150 spin_lock(&inode->i_mapping->private_lock);
1151 link_dev_buffers(page, bh);
1152 init_page_buffers(page, bdev, block, size);
1153 spin_unlock(&inode->i_mapping->private_lock);
1154 return page;
1155
1156failed:
1157 BUG();
1158 unlock_page(page);
1159 page_cache_release(page);
1160 return NULL;
1161}
1162
1163/*
1164 * Create buffers for the specified block device block's page. If
1165 * that page was dirty, the buffers are set dirty also.
1166 *
1167 * Except that's a bug. Attaching dirty buffers to a dirty
1168 * blockdev's page can result in filesystem corruption, because
1169 * some of those buffers may be aliases of filesystem data.
1170 * grow_dev_page() will go BUG() if this happens.
1171 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08001172static int
Linus Torvalds1da177e2005-04-16 15:20:36 -07001173grow_buffers(struct block_device *bdev, sector_t block, int size)
1174{
1175 struct page *page;
1176 pgoff_t index;
1177 int sizebits;
1178
1179 sizebits = -1;
1180 do {
1181 sizebits++;
1182 } while ((size << sizebits) < PAGE_SIZE);
1183
1184 index = block >> sizebits;
1185 block = index << sizebits;
1186
1187 /* Create a page with the proper size buffers.. */
1188 page = grow_dev_page(bdev, block, index, size);
1189 if (!page)
1190 return 0;
1191 unlock_page(page);
1192 page_cache_release(page);
1193 return 1;
1194}
1195
Adrian Bunk75c96f82005-05-05 16:16:09 -07001196static struct buffer_head *
Linus Torvalds1da177e2005-04-16 15:20:36 -07001197__getblk_slow(struct block_device *bdev, sector_t block, int size)
1198{
1199 /* Size must be multiple of hard sectorsize */
1200 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1201 (size < 512 || size > PAGE_SIZE))) {
1202 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1203 size);
1204 printk(KERN_ERR "hardsect size: %d\n",
1205 bdev_hardsect_size(bdev));
1206
1207 dump_stack();
1208 return NULL;
1209 }
1210
1211 for (;;) {
1212 struct buffer_head * bh;
1213
1214 bh = __find_get_block(bdev, block, size);
1215 if (bh)
1216 return bh;
1217
1218 if (!grow_buffers(bdev, block, size))
1219 free_more_memory();
1220 }
1221}
1222
1223/*
1224 * The relationship between dirty buffers and dirty pages:
1225 *
1226 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1227 * the page is tagged dirty in its radix tree.
1228 *
1229 * At all times, the dirtiness of the buffers represents the dirtiness of
1230 * subsections of the page. If the page has buffers, the page dirty bit is
1231 * merely a hint about the true dirty state.
1232 *
1233 * When a page is set dirty in its entirety, all its buffers are marked dirty
1234 * (if the page has buffers).
1235 *
1236 * When a buffer is marked dirty, its page is dirtied, but the page's other
1237 * buffers are not.
1238 *
1239 * Also. When blockdev buffers are explicitly read with bread(), they
1240 * individually become uptodate. But their backing page remains not
1241 * uptodate - even if all of its buffers are uptodate. A subsequent
1242 * block_read_full_page() against that page will discover all the uptodate
1243 * buffers, will set the page uptodate and will perform no I/O.
1244 */
1245
1246/**
1247 * mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz67be2dd2005-05-01 08:59:26 -07001248 * @bh: the buffer_head to mark dirty
Linus Torvalds1da177e2005-04-16 15:20:36 -07001249 *
1250 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1251 * backing page dirty, then tag the page as dirty in its address_space's radix
1252 * tree and then attach the address_space's inode to its superblock's dirty
1253 * inode list.
1254 *
1255 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1256 * mapping->tree_lock and the global inode_lock.
1257 */
1258void fastcall mark_buffer_dirty(struct buffer_head *bh)
1259{
1260 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1261 __set_page_dirty_nobuffers(bh->b_page);
1262}
1263
1264/*
1265 * Decrement a buffer_head's reference count. If all buffers against a page
1266 * have zero reference count, are clean and unlocked, and if the page is clean
1267 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1268 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1269 * a page but it ends up not being freed, and buffers may later be reattached).
1270 */
1271void __brelse(struct buffer_head * buf)
1272{
1273 if (atomic_read(&buf->b_count)) {
1274 put_bh(buf);
1275 return;
1276 }
1277 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1278 WARN_ON(1);
1279}
1280
1281/*
1282 * bforget() is like brelse(), except it discards any
1283 * potentially dirty data.
1284 */
1285void __bforget(struct buffer_head *bh)
1286{
1287 clear_buffer_dirty(bh);
1288 if (!list_empty(&bh->b_assoc_buffers)) {
1289 struct address_space *buffer_mapping = bh->b_page->mapping;
1290
1291 spin_lock(&buffer_mapping->private_lock);
1292 list_del_init(&bh->b_assoc_buffers);
1293 spin_unlock(&buffer_mapping->private_lock);
1294 }
1295 __brelse(bh);
1296}
1297
1298static struct buffer_head *__bread_slow(struct buffer_head *bh)
1299{
1300 lock_buffer(bh);
1301 if (buffer_uptodate(bh)) {
1302 unlock_buffer(bh);
1303 return bh;
1304 } else {
1305 get_bh(bh);
1306 bh->b_end_io = end_buffer_read_sync;
1307 submit_bh(READ, bh);
1308 wait_on_buffer(bh);
1309 if (buffer_uptodate(bh))
1310 return bh;
1311 }
1312 brelse(bh);
1313 return NULL;
1314}
1315
1316/*
1317 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1318 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1319 * refcount elevated by one when they're in an LRU. A buffer can only appear
1320 * once in a particular CPU's LRU. A single buffer can be present in multiple
1321 * CPU's LRUs at the same time.
1322 *
1323 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1324 * sb_find_get_block().
1325 *
1326 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1327 * a local interrupt disable for that.
1328 */
1329
1330#define BH_LRU_SIZE 8
1331
1332struct bh_lru {
1333 struct buffer_head *bhs[BH_LRU_SIZE];
1334};
1335
1336static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1337
1338#ifdef CONFIG_SMP
1339#define bh_lru_lock() local_irq_disable()
1340#define bh_lru_unlock() local_irq_enable()
1341#else
1342#define bh_lru_lock() preempt_disable()
1343#define bh_lru_unlock() preempt_enable()
1344#endif
1345
1346static inline void check_irqs_on(void)
1347{
1348#ifdef irqs_disabled
1349 BUG_ON(irqs_disabled());
1350#endif
1351}
1352
1353/*
1354 * The LRU management algorithm is dopey-but-simple. Sorry.
1355 */
1356static void bh_lru_install(struct buffer_head *bh)
1357{
1358 struct buffer_head *evictee = NULL;
1359 struct bh_lru *lru;
1360
1361 check_irqs_on();
1362 bh_lru_lock();
1363 lru = &__get_cpu_var(bh_lrus);
1364 if (lru->bhs[0] != bh) {
1365 struct buffer_head *bhs[BH_LRU_SIZE];
1366 int in;
1367 int out = 0;
1368
1369 get_bh(bh);
1370 bhs[out++] = bh;
1371 for (in = 0; in < BH_LRU_SIZE; in++) {
1372 struct buffer_head *bh2 = lru->bhs[in];
1373
1374 if (bh2 == bh) {
1375 __brelse(bh2);
1376 } else {
1377 if (out >= BH_LRU_SIZE) {
1378 BUG_ON(evictee != NULL);
1379 evictee = bh2;
1380 } else {
1381 bhs[out++] = bh2;
1382 }
1383 }
1384 }
1385 while (out < BH_LRU_SIZE)
1386 bhs[out++] = NULL;
1387 memcpy(lru->bhs, bhs, sizeof(bhs));
1388 }
1389 bh_lru_unlock();
1390
1391 if (evictee)
1392 __brelse(evictee);
1393}
1394
1395/*
1396 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1397 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08001398static struct buffer_head *
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1400{
1401 struct buffer_head *ret = NULL;
1402 struct bh_lru *lru;
1403 int i;
1404
1405 check_irqs_on();
1406 bh_lru_lock();
1407 lru = &__get_cpu_var(bh_lrus);
1408 for (i = 0; i < BH_LRU_SIZE; i++) {
1409 struct buffer_head *bh = lru->bhs[i];
1410
1411 if (bh && bh->b_bdev == bdev &&
1412 bh->b_blocknr == block && bh->b_size == size) {
1413 if (i) {
1414 while (i) {
1415 lru->bhs[i] = lru->bhs[i - 1];
1416 i--;
1417 }
1418 lru->bhs[0] = bh;
1419 }
1420 get_bh(bh);
1421 ret = bh;
1422 break;
1423 }
1424 }
1425 bh_lru_unlock();
1426 return ret;
1427}
1428
1429/*
1430 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1431 * it in the LRU and mark it as accessed. If it is not present then return
1432 * NULL
1433 */
1434struct buffer_head *
1435__find_get_block(struct block_device *bdev, sector_t block, int size)
1436{
1437 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1438
1439 if (bh == NULL) {
Coywolf Qi Hunt385fd4c2005-11-07 00:59:39 -08001440 bh = __find_get_block_slow(bdev, block);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001441 if (bh)
1442 bh_lru_install(bh);
1443 }
1444 if (bh)
1445 touch_buffer(bh);
1446 return bh;
1447}
1448EXPORT_SYMBOL(__find_get_block);
1449
1450/*
1451 * __getblk will locate (and, if necessary, create) the buffer_head
1452 * which corresponds to the passed block_device, block and size. The
1453 * returned buffer has its reference count incremented.
1454 *
1455 * __getblk() cannot fail - it just keeps trying. If you pass it an
1456 * illegal block number, __getblk() will happily return a buffer_head
1457 * which represents the non-existent block. Very weird.
1458 *
1459 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1460 * attempt is failing. FIXME, perhaps?
1461 */
1462struct buffer_head *
1463__getblk(struct block_device *bdev, sector_t block, int size)
1464{
1465 struct buffer_head *bh = __find_get_block(bdev, block, size);
1466
1467 might_sleep();
1468 if (bh == NULL)
1469 bh = __getblk_slow(bdev, block, size);
1470 return bh;
1471}
1472EXPORT_SYMBOL(__getblk);
1473
1474/*
1475 * Do async read-ahead on a buffer..
1476 */
1477void __breadahead(struct block_device *bdev, sector_t block, int size)
1478{
1479 struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Mortona3e713b2005-10-30 15:03:15 -08001480 if (likely(bh)) {
1481 ll_rw_block(READA, 1, &bh);
1482 brelse(bh);
1483 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484}
1485EXPORT_SYMBOL(__breadahead);
1486
1487/**
1488 * __bread() - reads a specified block and returns the bh
Martin Waitz67be2dd2005-05-01 08:59:26 -07001489 * @bdev: the block_device to read from
Linus Torvalds1da177e2005-04-16 15:20:36 -07001490 * @block: number of block
1491 * @size: size (in bytes) to read
1492 *
1493 * Reads a specified block, and returns buffer head that contains it.
1494 * It returns NULL if the block was unreadable.
1495 */
1496struct buffer_head *
1497__bread(struct block_device *bdev, sector_t block, int size)
1498{
1499 struct buffer_head *bh = __getblk(bdev, block, size);
1500
Andrew Mortona3e713b2005-10-30 15:03:15 -08001501 if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001502 bh = __bread_slow(bh);
1503 return bh;
1504}
1505EXPORT_SYMBOL(__bread);
1506
1507/*
1508 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1509 * This doesn't race because it runs in each cpu either in irq
1510 * or with preempt disabled.
1511 */
1512static void invalidate_bh_lru(void *arg)
1513{
1514 struct bh_lru *b = &get_cpu_var(bh_lrus);
1515 int i;
1516
1517 for (i = 0; i < BH_LRU_SIZE; i++) {
1518 brelse(b->bhs[i]);
1519 b->bhs[i] = NULL;
1520 }
1521 put_cpu_var(bh_lrus);
1522}
1523
1524static void invalidate_bh_lrus(void)
1525{
1526 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1527}
1528
1529void set_bh_page(struct buffer_head *bh,
1530 struct page *page, unsigned long offset)
1531{
1532 bh->b_page = page;
1533 if (offset >= PAGE_SIZE)
1534 BUG();
1535 if (PageHighMem(page))
1536 /*
1537 * This catches illegal uses and preserves the offset:
1538 */
1539 bh->b_data = (char *)(0 + offset);
1540 else
1541 bh->b_data = page_address(page) + offset;
1542}
1543EXPORT_SYMBOL(set_bh_page);
1544
1545/*
1546 * Called when truncating a buffer on a page completely.
1547 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08001548static void discard_buffer(struct buffer_head * bh)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549{
1550 lock_buffer(bh);
1551 clear_buffer_dirty(bh);
1552 bh->b_bdev = NULL;
1553 clear_buffer_mapped(bh);
1554 clear_buffer_req(bh);
1555 clear_buffer_new(bh);
1556 clear_buffer_delay(bh);
1557 unlock_buffer(bh);
1558}
1559
1560/**
1561 * try_to_release_page() - release old fs-specific metadata on a page
1562 *
1563 * @page: the page which the kernel is trying to free
1564 * @gfp_mask: memory allocation flags (and I/O mode)
1565 *
1566 * The address_space is to try to release any data against the page
1567 * (presumably at page->private). If the release was successful, return `1'.
1568 * Otherwise return zero.
1569 *
1570 * The @gfp_mask argument specifies whether I/O may be performed to release
1571 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1572 *
1573 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1574 */
Al Viro27496a82005-10-21 03:20:48 -04001575int try_to_release_page(struct page *page, gfp_t gfp_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001576{
1577 struct address_space * const mapping = page->mapping;
1578
1579 BUG_ON(!PageLocked(page));
1580 if (PageWriteback(page))
1581 return 0;
1582
1583 if (mapping && mapping->a_ops->releasepage)
1584 return mapping->a_ops->releasepage(page, gfp_mask);
1585 return try_to_free_buffers(page);
1586}
1587EXPORT_SYMBOL(try_to_release_page);
1588
1589/**
1590 * block_invalidatepage - invalidate part of all of a buffer-backed page
1591 *
1592 * @page: the page which is affected
1593 * @offset: the index of the truncation point
1594 *
1595 * block_invalidatepage() is called when all or part of the page has become
1596 * invalidatedby a truncate operation.
1597 *
1598 * block_invalidatepage() does not have to release all buffers, but it must
1599 * ensure that no dirty buffer is left outside @offset and that no I/O
1600 * is underway against any of the blocks which are outside the truncation
1601 * point. Because the caller is about to free (and possibly reuse) those
1602 * blocks on-disk.
1603 */
1604int block_invalidatepage(struct page *page, unsigned long offset)
1605{
1606 struct buffer_head *head, *bh, *next;
1607 unsigned int curr_off = 0;
1608 int ret = 1;
1609
1610 BUG_ON(!PageLocked(page));
1611 if (!page_has_buffers(page))
1612 goto out;
1613
1614 head = page_buffers(page);
1615 bh = head;
1616 do {
1617 unsigned int next_off = curr_off + bh->b_size;
1618 next = bh->b_this_page;
1619
1620 /*
1621 * is this block fully invalidated?
1622 */
1623 if (offset <= curr_off)
1624 discard_buffer(bh);
1625 curr_off = next_off;
1626 bh = next;
1627 } while (bh != head);
1628
1629 /*
1630 * We release buffers only if the entire page is being invalidated.
1631 * The get_block cached value has been unconditionally invalidated,
1632 * so real IO is not possible anymore.
1633 */
1634 if (offset == 0)
1635 ret = try_to_release_page(page, 0);
1636out:
1637 return ret;
1638}
1639EXPORT_SYMBOL(block_invalidatepage);
1640
Jan Karaaaa40592005-10-30 15:00:16 -08001641int do_invalidatepage(struct page *page, unsigned long offset)
1642{
1643 int (*invalidatepage)(struct page *, unsigned long);
1644 invalidatepage = page->mapping->a_ops->invalidatepage;
1645 if (invalidatepage == NULL)
1646 invalidatepage = block_invalidatepage;
1647 return (*invalidatepage)(page, offset);
1648}
1649
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650/*
1651 * We attach and possibly dirty the buffers atomically wrt
1652 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1653 * is already excluded via the page lock.
1654 */
1655void create_empty_buffers(struct page *page,
1656 unsigned long blocksize, unsigned long b_state)
1657{
1658 struct buffer_head *bh, *head, *tail;
1659
1660 head = alloc_page_buffers(page, blocksize, 1);
1661 bh = head;
1662 do {
1663 bh->b_state |= b_state;
1664 tail = bh;
1665 bh = bh->b_this_page;
1666 } while (bh);
1667 tail->b_this_page = head;
1668
1669 spin_lock(&page->mapping->private_lock);
1670 if (PageUptodate(page) || PageDirty(page)) {
1671 bh = head;
1672 do {
1673 if (PageDirty(page))
1674 set_buffer_dirty(bh);
1675 if (PageUptodate(page))
1676 set_buffer_uptodate(bh);
1677 bh = bh->b_this_page;
1678 } while (bh != head);
1679 }
1680 attach_page_buffers(page, head);
1681 spin_unlock(&page->mapping->private_lock);
1682}
1683EXPORT_SYMBOL(create_empty_buffers);
1684
1685/*
1686 * We are taking a block for data and we don't want any output from any
1687 * buffer-cache aliases starting from return from that function and
1688 * until the moment when something will explicitly mark the buffer
1689 * dirty (hopefully that will not happen until we will free that block ;-)
1690 * We don't even need to mark it not-uptodate - nobody can expect
1691 * anything from a newly allocated buffer anyway. We used to used
1692 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1693 * don't want to mark the alias unmapped, for example - it would confuse
1694 * anyone who might pick it with bread() afterwards...
1695 *
1696 * Also.. Note that bforget() doesn't lock the buffer. So there can
1697 * be writeout I/O going on against recently-freed buffers. We don't
1698 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1699 * only if we really need to. That happens here.
1700 */
1701void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1702{
1703 struct buffer_head *old_bh;
1704
1705 might_sleep();
1706
Coywolf Qi Hunt385fd4c2005-11-07 00:59:39 -08001707 old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708 if (old_bh) {
1709 clear_buffer_dirty(old_bh);
1710 wait_on_buffer(old_bh);
1711 clear_buffer_req(old_bh);
1712 __brelse(old_bh);
1713 }
1714}
1715EXPORT_SYMBOL(unmap_underlying_metadata);
1716
1717/*
1718 * NOTE! All mapped/uptodate combinations are valid:
1719 *
1720 * Mapped Uptodate Meaning
1721 *
1722 * No No "unknown" - must do get_block()
1723 * No Yes "hole" - zero-filled
1724 * Yes No "allocated" - allocated on disk, not read in
1725 * Yes Yes "valid" - allocated and up-to-date in memory.
1726 *
1727 * "Dirty" is valid only with the last case (mapped+uptodate).
1728 */
1729
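/*
 * A minimal sketch of how these states are consulted in practice - compare
 * __block_prepare_write() and block_read_full_page() below.  An "unknown"
 * buffer needs get_block(), an "allocated" buffer needs a read before its
 * contents can be trusted, and "hole"/"valid" buffers can be used as-is:
 *
 *	if (!buffer_mapped(bh) && !buffer_uptodate(bh))
 *		err = get_block(inode, block, bh, create);
 *	else if (buffer_mapped(bh) && !buffer_uptodate(bh))
 *		ll_rw_block(READ, 1, &bh);
 */
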
1730/*
1731 * While block_write_full_page is writing back the dirty buffers under
1732 * the page lock, whoever dirtied the buffers may decide to clean them
1733 * again at any time. We handle that by only looking at the buffer
1734 * state inside lock_buffer().
1735 *
1736 * If block_write_full_page() is called for regular writeback
1737 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 1738 * locked buffer. This can only happen if someone has written the buffer
1739 * directly, with submit_bh(). At the address_space level PageWriteback
1740 * prevents this contention from occurring.
1741 */
1742static int __block_write_full_page(struct inode *inode, struct page *page,
1743 get_block_t *get_block, struct writeback_control *wbc)
1744{
1745 int err;
1746 sector_t block;
1747 sector_t last_block;
Andrew Mortonf0fbd5f2005-05-05 16:15:48 -07001748 struct buffer_head *bh, *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749 int nr_underway = 0;
1750
1751 BUG_ON(!PageLocked(page));
1752
1753 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1754
1755 if (!page_has_buffers(page)) {
1756 create_empty_buffers(page, 1 << inode->i_blkbits,
1757 (1 << BH_Dirty)|(1 << BH_Uptodate));
1758 }
1759
1760 /*
1761 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1762 * here, and the (potentially unmapped) buffers may become dirty at
1763 * any time. If a buffer becomes dirty here after we've inspected it
1764 * then we just miss that fact, and the page stays dirty.
1765 *
1766 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1767 * handle that here by just cleaning them.
1768 */
1769
Andrew Morton54b21a72006-01-08 01:03:05 -08001770 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771 head = page_buffers(page);
1772 bh = head;
1773
1774 /*
1775 * Get all the dirty buffers mapped to disk addresses and
1776 * handle any aliases from the underlying blockdev's mapping.
1777 */
1778 do {
1779 if (block > last_block) {
1780 /*
1781 * mapped buffers outside i_size will occur, because
1782 * this page can be outside i_size when there is a
1783 * truncate in progress.
1784 */
1785 /*
1786 * The buffer was zeroed by block_write_full_page()
1787 */
1788 clear_buffer_dirty(bh);
1789 set_buffer_uptodate(bh);
1790 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1791 err = get_block(inode, block, bh, 1);
1792 if (err)
1793 goto recover;
1794 if (buffer_new(bh)) {
1795 /* blockdev mappings never come here */
1796 clear_buffer_new(bh);
1797 unmap_underlying_metadata(bh->b_bdev,
1798 bh->b_blocknr);
1799 }
1800 }
1801 bh = bh->b_this_page;
1802 block++;
1803 } while (bh != head);
1804
1805 do {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806 if (!buffer_mapped(bh))
1807 continue;
1808 /*
1809 * If it's a fully non-blocking write attempt and we cannot
1810 * lock the buffer then redirty the page. Note that this can
1811 * potentially cause a busy-wait loop from pdflush and kswapd
1812 * activity, but those code paths have their own higher-level
1813 * throttling.
1814 */
1815 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1816 lock_buffer(bh);
1817 } else if (test_set_buffer_locked(bh)) {
1818 redirty_page_for_writepage(wbc, page);
1819 continue;
1820 }
1821 if (test_clear_buffer_dirty(bh)) {
1822 mark_buffer_async_write(bh);
1823 } else {
1824 unlock_buffer(bh);
1825 }
1826 } while ((bh = bh->b_this_page) != head);
1827
1828 /*
1829 * The page and its buffers are protected by PageWriteback(), so we can
1830 * drop the bh refcounts early.
1831 */
1832 BUG_ON(PageWriteback(page));
1833 set_page_writeback(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001834
1835 do {
1836 struct buffer_head *next = bh->b_this_page;
1837 if (buffer_async_write(bh)) {
1838 submit_bh(WRITE, bh);
1839 nr_underway++;
1840 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841 bh = next;
1842 } while (bh != head);
Andrew Morton05937ba2005-05-05 16:15:47 -07001843 unlock_page(page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001844
1845 err = 0;
1846done:
1847 if (nr_underway == 0) {
1848 /*
1849 * The page was marked dirty, but the buffers were
1850 * clean. Someone wrote them back by hand with
1851 * ll_rw_block/submit_bh. A rare case.
1852 */
1853 int uptodate = 1;
1854 do {
1855 if (!buffer_uptodate(bh)) {
1856 uptodate = 0;
1857 break;
1858 }
1859 bh = bh->b_this_page;
1860 } while (bh != head);
1861 if (uptodate)
1862 SetPageUptodate(page);
1863 end_page_writeback(page);
1864 /*
1865 * The page and buffer_heads can be released at any time from
1866 * here on.
1867 */
1868 wbc->pages_skipped++; /* We didn't write this page */
1869 }
1870 return err;
1871
1872recover:
1873 /*
1874 * ENOSPC, or some other error. We may already have added some
1875 * blocks to the file, so we need to write these out to avoid
1876 * exposing stale data.
1877 * The page is currently locked and not marked for writeback
1878 */
1879 bh = head;
1880 /* Recovery: lock and submit the mapped buffers */
1881 do {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001882 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1883 lock_buffer(bh);
1884 mark_buffer_async_write(bh);
1885 } else {
1886 /*
1887 * The buffer may have been set dirty during
1888 * attachment to a dirty page.
1889 */
1890 clear_buffer_dirty(bh);
1891 }
1892 } while ((bh = bh->b_this_page) != head);
1893 SetPageError(page);
1894 BUG_ON(PageWriteback(page));
1895 set_page_writeback(page);
1896 unlock_page(page);
1897 do {
1898 struct buffer_head *next = bh->b_this_page;
1899 if (buffer_async_write(bh)) {
1900 clear_buffer_dirty(bh);
1901 submit_bh(WRITE, bh);
1902 nr_underway++;
1903 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001904 bh = next;
1905 } while (bh != head);
1906 goto done;
1907}
1908
1909static int __block_prepare_write(struct inode *inode, struct page *page,
1910 unsigned from, unsigned to, get_block_t *get_block)
1911{
1912 unsigned block_start, block_end;
1913 sector_t block;
1914 int err = 0;
1915 unsigned blocksize, bbits;
1916 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1917
1918 BUG_ON(!PageLocked(page));
1919 BUG_ON(from > PAGE_CACHE_SIZE);
1920 BUG_ON(to > PAGE_CACHE_SIZE);
1921 BUG_ON(from > to);
1922
1923 blocksize = 1 << inode->i_blkbits;
1924 if (!page_has_buffers(page))
1925 create_empty_buffers(page, blocksize, 0);
1926 head = page_buffers(page);
1927
1928 bbits = inode->i_blkbits;
1929 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1930
1931 for(bh = head, block_start = 0; bh != head || !block_start;
1932 block++, block_start=block_end, bh = bh->b_this_page) {
1933 block_end = block_start + blocksize;
1934 if (block_end <= from || block_start >= to) {
1935 if (PageUptodate(page)) {
1936 if (!buffer_uptodate(bh))
1937 set_buffer_uptodate(bh);
1938 }
1939 continue;
1940 }
1941 if (buffer_new(bh))
1942 clear_buffer_new(bh);
1943 if (!buffer_mapped(bh)) {
1944 err = get_block(inode, block, bh, 1);
1945 if (err)
Nick Pigginf3ddbdc2005-05-05 16:15:45 -07001946 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001947 if (buffer_new(bh)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001948 unmap_underlying_metadata(bh->b_bdev,
1949 bh->b_blocknr);
1950 if (PageUptodate(page)) {
1951 set_buffer_uptodate(bh);
1952 continue;
1953 }
1954 if (block_end > to || block_start < from) {
1955 void *kaddr;
1956
1957 kaddr = kmap_atomic(page, KM_USER0);
1958 if (block_end > to)
1959 memset(kaddr+to, 0,
1960 block_end-to);
1961 if (block_start < from)
1962 memset(kaddr+block_start,
1963 0, from-block_start);
1964 flush_dcache_page(page);
1965 kunmap_atomic(kaddr, KM_USER0);
1966 }
1967 continue;
1968 }
1969 }
1970 if (PageUptodate(page)) {
1971 if (!buffer_uptodate(bh))
1972 set_buffer_uptodate(bh);
1973 continue;
1974 }
1975 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1976 (block_start < from || block_end > to)) {
1977 ll_rw_block(READ, 1, &bh);
1978 *wait_bh++=bh;
1979 }
1980 }
1981 /*
1982 * If we issued read requests - let them complete.
1983 */
1984 while(wait_bh > wait) {
1985 wait_on_buffer(*--wait_bh);
1986 if (!buffer_uptodate(*wait_bh))
Nick Pigginf3ddbdc2005-05-05 16:15:45 -07001987 err = -EIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001988 }
Anton Altaparmakov152becd2005-06-23 00:10:21 -07001989 if (!err) {
1990 bh = head;
1991 do {
1992 if (buffer_new(bh))
1993 clear_buffer_new(bh);
1994 } while ((bh = bh->b_this_page) != head);
1995 return 0;
1996 }
Nick Pigginf3ddbdc2005-05-05 16:15:45 -07001997 /* Error case: */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998 /*
1999 * Zero out any newly allocated blocks to avoid exposing stale
2000 * data. If BH_New is set, we know that the block was newly
2001 * allocated in the above loop.
2002 */
2003 bh = head;
2004 block_start = 0;
2005 do {
2006 block_end = block_start+blocksize;
2007 if (block_end <= from)
2008 goto next_bh;
2009 if (block_start >= to)
2010 break;
2011 if (buffer_new(bh)) {
2012 void *kaddr;
2013
2014 clear_buffer_new(bh);
2015 kaddr = kmap_atomic(page, KM_USER0);
2016 memset(kaddr+block_start, 0, bh->b_size);
2017 kunmap_atomic(kaddr, KM_USER0);
2018 set_buffer_uptodate(bh);
2019 mark_buffer_dirty(bh);
2020 }
2021next_bh:
2022 block_start = block_end;
2023 bh = bh->b_this_page;
2024 } while (bh != head);
2025 return err;
2026}
2027
2028static int __block_commit_write(struct inode *inode, struct page *page,
2029 unsigned from, unsigned to)
2030{
2031 unsigned block_start, block_end;
2032 int partial = 0;
2033 unsigned blocksize;
2034 struct buffer_head *bh, *head;
2035
2036 blocksize = 1 << inode->i_blkbits;
2037
2038 for(bh = head = page_buffers(page), block_start = 0;
2039 bh != head || !block_start;
2040 block_start=block_end, bh = bh->b_this_page) {
2041 block_end = block_start + blocksize;
2042 if (block_end <= from || block_start >= to) {
2043 if (!buffer_uptodate(bh))
2044 partial = 1;
2045 } else {
2046 set_buffer_uptodate(bh);
2047 mark_buffer_dirty(bh);
2048 }
2049 }
2050
2051 /*
2052 * If this is a partial write which happened to make all buffers
2053 * uptodate then we can optimize away a bogus readpage() for
2054 * the next read(). Here we 'discover' whether the page went
2055 * uptodate as a result of this (potentially partial) write.
2056 */
2057 if (!partial)
2058 SetPageUptodate(page);
2059 return 0;
2060}
2061
2062/*
2063 * Generic "read page" function for block devices that have the normal
2064 * get_block functionality. This is most of the block device filesystems.
2065 * Reads the page asynchronously --- the unlock_buffer() and
2066 * set/clear_buffer_uptodate() functions propagate buffer state into the
2067 * page struct once IO has completed.
2068 */
2069int block_read_full_page(struct page *page, get_block_t *get_block)
2070{
2071 struct inode *inode = page->mapping->host;
2072 sector_t iblock, lblock;
2073 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2074 unsigned int blocksize;
2075 int nr, i;
2076 int fully_mapped = 1;
2077
Matt Mackallcd7619d2005-05-01 08:59:01 -07002078 BUG_ON(!PageLocked(page));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 blocksize = 1 << inode->i_blkbits;
2080 if (!page_has_buffers(page))
2081 create_empty_buffers(page, blocksize, 0);
2082 head = page_buffers(page);
2083
2084 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2085 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2086 bh = head;
2087 nr = 0;
2088 i = 0;
2089
2090 do {
2091 if (buffer_uptodate(bh))
2092 continue;
2093
2094 if (!buffer_mapped(bh)) {
Andrew Mortonc64610b2005-05-16 21:53:49 -07002095 int err = 0;
2096
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097 fully_mapped = 0;
2098 if (iblock < lblock) {
Andrew Mortonc64610b2005-05-16 21:53:49 -07002099 err = get_block(inode, iblock, bh, 0);
2100 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002101 SetPageError(page);
2102 }
2103 if (!buffer_mapped(bh)) {
2104 void *kaddr = kmap_atomic(page, KM_USER0);
2105 memset(kaddr + i * blocksize, 0, blocksize);
2106 flush_dcache_page(page);
2107 kunmap_atomic(kaddr, KM_USER0);
Andrew Mortonc64610b2005-05-16 21:53:49 -07002108 if (!err)
2109 set_buffer_uptodate(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002110 continue;
2111 }
2112 /*
2113 * get_block() might have updated the buffer
2114 * synchronously
2115 */
2116 if (buffer_uptodate(bh))
2117 continue;
2118 }
2119 arr[nr++] = bh;
2120 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2121
2122 if (fully_mapped)
2123 SetPageMappedToDisk(page);
2124
2125 if (!nr) {
2126 /*
2127 * All buffers are uptodate - we can set the page uptodate
2128 * as well. But not if get_block() returned an error.
2129 */
2130 if (!PageError(page))
2131 SetPageUptodate(page);
2132 unlock_page(page);
2133 return 0;
2134 }
2135
2136 /* Stage two: lock the buffers */
2137 for (i = 0; i < nr; i++) {
2138 bh = arr[i];
2139 lock_buffer(bh);
2140 mark_buffer_async_read(bh);
2141 }
2142
2143 /*
2144 * Stage 3: start the IO. Check for uptodateness
2145 * inside the buffer lock in case another process reading
2146 * the underlying blockdev brought it uptodate (the sct fix).
2147 */
2148 for (i = 0; i < nr; i++) {
2149 bh = arr[i];
2150 if (buffer_uptodate(bh))
2151 end_buffer_async_read(bh, 1);
2152 else
2153 submit_bh(READ, bh);
2154 }
2155 return 0;
2156}
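
/*
 * Illustrative sketch: a filesystem normally uses this as its ->readpage
 * address_space operation; "myfs_get_block" stands in for that
 * filesystem's own get_block_t:
 *
 *	static int myfs_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, myfs_get_block);
 *	}
 */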
2157
2158/* utility function for filesystems that need to do work on expanding
2159 * truncates. Uses prepare/commit_write to allow the filesystem to
2160 * deal with the hole.
2161 */
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002162static int __generic_cont_expand(struct inode *inode, loff_t size,
2163 pgoff_t index, unsigned int offset)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002164{
2165 struct address_space *mapping = inode->i_mapping;
2166 struct page *page;
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002167 unsigned long limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002168 int err;
2169
2170 err = -EFBIG;
2171 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2172 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2173 send_sig(SIGXFSZ, current, 0);
2174 goto out;
2175 }
2176 if (size > inode->i_sb->s_maxbytes)
2177 goto out;
2178
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179 err = -ENOMEM;
2180 page = grab_cache_page(mapping, index);
2181 if (!page)
2182 goto out;
2183 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002184 if (err) {
2185 /*
2186 * ->prepare_write() may have instantiated a few blocks
2187 * outside i_size. Trim these off again.
2188 */
2189 unlock_page(page);
2190 page_cache_release(page);
2191 vmtruncate(inode, inode->i_size);
2192 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002193 }
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002194
2195 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2196
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197 unlock_page(page);
2198 page_cache_release(page);
2199 if (err > 0)
2200 err = 0;
2201out:
2202 return err;
2203}
2204
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08002205int generic_cont_expand(struct inode *inode, loff_t size)
2206{
2207 pgoff_t index;
2208 unsigned int offset;
2209
2210 offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2211
2212 /* ugh. in prepare/commit_write, if from==to==start of block, we
2213 ** skip the prepare. make sure we never send an offset for the start
2214 ** of a block
2215 */
2216 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2217 /* caller must handle this extra byte. */
2218 offset++;
2219 }
2220 index = size >> PAGE_CACHE_SHIFT;
2221
2222 return __generic_cont_expand(inode, size, index, offset);
2223}
2224
2225int generic_cont_expand_simple(struct inode *inode, loff_t size)
2226{
2227 loff_t pos = size - 1;
2228 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2229 unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2230
2231 /* prepare/commit_write can handle even if from==to==start of block. */
2232 return __generic_cont_expand(inode, size, index, offset);
2233}
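
/*
 * Illustrative sketch: a typical caller extends a file from its setattr
 * path before writing any new data, e.g.:
 *
 *	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
 *		err = generic_cont_expand_simple(inode, attr->ia_size);
 *
 * ("attr" here refers to the caller's struct iattr, not to anything
 * defined in this file.)
 */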
2234
Linus Torvalds1da177e2005-04-16 15:20:36 -07002235/*
 2236 * For moronic filesystems that do not allow holes in files.
2237 * We may have to extend the file.
2238 */
2239
2240int cont_prepare_write(struct page *page, unsigned offset,
2241 unsigned to, get_block_t *get_block, loff_t *bytes)
2242{
2243 struct address_space *mapping = page->mapping;
2244 struct inode *inode = mapping->host;
2245 struct page *new_page;
2246 pgoff_t pgpos;
2247 long status;
2248 unsigned zerofrom;
2249 unsigned blocksize = 1 << inode->i_blkbits;
2250 void *kaddr;
2251
2252 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2253 status = -ENOMEM;
2254 new_page = grab_cache_page(mapping, pgpos);
2255 if (!new_page)
2256 goto out;
2257 /* we might sleep */
2258 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2259 unlock_page(new_page);
2260 page_cache_release(new_page);
2261 continue;
2262 }
2263 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2264 if (zerofrom & (blocksize-1)) {
2265 *bytes |= (blocksize-1);
2266 (*bytes)++;
2267 }
2268 status = __block_prepare_write(inode, new_page, zerofrom,
2269 PAGE_CACHE_SIZE, get_block);
2270 if (status)
2271 goto out_unmap;
2272 kaddr = kmap_atomic(new_page, KM_USER0);
2273 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2274 flush_dcache_page(new_page);
2275 kunmap_atomic(kaddr, KM_USER0);
2276 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2277 unlock_page(new_page);
2278 page_cache_release(new_page);
2279 }
2280
2281 if (page->index < pgpos) {
2282 /* completely inside the area */
2283 zerofrom = offset;
2284 } else {
2285 /* page covers the boundary, find the boundary offset */
2286 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2287
2288 /* if we will expand the thing last block will be filled */
2289 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2290 *bytes |= (blocksize-1);
2291 (*bytes)++;
2292 }
2293
2294 /* starting below the boundary? Nothing to zero out */
2295 if (offset <= zerofrom)
2296 zerofrom = offset;
2297 }
2298 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2299 if (status)
2300 goto out1;
2301 if (zerofrom < offset) {
2302 kaddr = kmap_atomic(page, KM_USER0);
2303 memset(kaddr+zerofrom, 0, offset-zerofrom);
2304 flush_dcache_page(page);
2305 kunmap_atomic(kaddr, KM_USER0);
2306 __block_commit_write(inode, page, zerofrom, offset);
2307 }
2308 return 0;
2309out1:
2310 ClearPageUptodate(page);
2311 return status;
2312
2313out_unmap:
2314 ClearPageUptodate(new_page);
2315 unlock_page(new_page);
2316 page_cache_release(new_page);
2317out:
2318 return status;
2319}
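
/*
 * Illustrative sketch: a no-holes filesystem typically wraps this as its
 * ->prepare_write, passing a per-inode count of initialised bytes (the
 * myfs_* names and the "mmu_private" field are hypothetical):
 *
 *	static int myfs_prepare_write(struct file *file, struct page *page,
 *				      unsigned from, unsigned to)
 *	{
 *		struct inode *inode = page->mapping->host;
 *		return cont_prepare_write(page, from, to, myfs_get_block,
 *					  &myfs_i(inode)->mmu_private);
 *	}
 */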
2320
2321int block_prepare_write(struct page *page, unsigned from, unsigned to,
2322 get_block_t *get_block)
2323{
2324 struct inode *inode = page->mapping->host;
2325 int err = __block_prepare_write(inode, page, from, to, get_block);
2326 if (err)
2327 ClearPageUptodate(page);
2328 return err;
2329}
2330
2331int block_commit_write(struct page *page, unsigned from, unsigned to)
2332{
2333 struct inode *inode = page->mapping->host;
2334 __block_commit_write(inode,page,from,to);
2335 return 0;
2336}
2337
2338int generic_commit_write(struct file *file, struct page *page,
2339 unsigned from, unsigned to)
2340{
2341 struct inode *inode = page->mapping->host;
2342 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2343 __block_commit_write(inode,page,from,to);
2344 /*
2345 * No need to use i_size_read() here, the i_size
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08002346 * cannot change under us because we hold i_mutex.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347 */
2348 if (pos > inode->i_size) {
2349 i_size_write(inode, pos);
2350 mark_inode_dirty(inode);
2351 }
2352 return 0;
2353}
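
/*
 * Illustrative sketch: block_prepare_write() and generic_commit_write()
 * are usually wired up together in a filesystem's address_space
 * operations (hypothetical myfs names):
 *
 *	static struct address_space_operations myfs_aops = {
 *		.readpage	= myfs_readpage,
 *		.writepage	= myfs_writepage,
 *		.prepare_write	= myfs_prepare_write,
 *		.commit_write	= generic_commit_write,
 *	};
 */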
2354
2355
2356/*
2357 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2358 * immediately, while under the page lock. So it needs a special end_io
2359 * handler which does not touch the bh after unlocking it.
2360 *
2361 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 2362 * a race there is benign: unlock_buffer() only uses the bh's address for
2363 * hashing after unlocking the buffer, so it doesn't actually touch the bh
2364 * itself.
2365 */
2366static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2367{
2368 if (uptodate) {
2369 set_buffer_uptodate(bh);
2370 } else {
2371 /* This happens, due to failed READA attempts. */
2372 clear_buffer_uptodate(bh);
2373 }
2374 unlock_buffer(bh);
2375}
2376
2377/*
2378 * On entry, the page is fully not uptodate.
2379 * On exit the page is fully uptodate in the areas outside (from,to)
2380 */
2381int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2382 get_block_t *get_block)
2383{
2384 struct inode *inode = page->mapping->host;
2385 const unsigned blkbits = inode->i_blkbits;
2386 const unsigned blocksize = 1 << blkbits;
2387 struct buffer_head map_bh;
2388 struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2389 unsigned block_in_page;
2390 unsigned block_start;
2391 sector_t block_in_file;
2392 char *kaddr;
2393 int nr_reads = 0;
2394 int i;
2395 int ret = 0;
2396 int is_mapped_to_disk = 1;
2397 int dirtied_it = 0;
2398
2399 if (PageMappedToDisk(page))
2400 return 0;
2401
2402 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2403 map_bh.b_page = page;
2404
2405 /*
2406 * We loop across all blocks in the page, whether or not they are
2407 * part of the affected region. This is so we can discover if the
2408 * page is fully mapped-to-disk.
2409 */
2410 for (block_start = 0, block_in_page = 0;
2411 block_start < PAGE_CACHE_SIZE;
2412 block_in_page++, block_start += blocksize) {
2413 unsigned block_end = block_start + blocksize;
2414 int create;
2415
2416 map_bh.b_state = 0;
2417 create = 1;
2418 if (block_start >= to)
2419 create = 0;
2420 ret = get_block(inode, block_in_file + block_in_page,
2421 &map_bh, create);
2422 if (ret)
2423 goto failed;
2424 if (!buffer_mapped(&map_bh))
2425 is_mapped_to_disk = 0;
2426 if (buffer_new(&map_bh))
2427 unmap_underlying_metadata(map_bh.b_bdev,
2428 map_bh.b_blocknr);
2429 if (PageUptodate(page))
2430 continue;
2431 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2432 kaddr = kmap_atomic(page, KM_USER0);
2433 if (block_start < from) {
2434 memset(kaddr+block_start, 0, from-block_start);
2435 dirtied_it = 1;
2436 }
2437 if (block_end > to) {
2438 memset(kaddr + to, 0, block_end - to);
2439 dirtied_it = 1;
2440 }
2441 flush_dcache_page(page);
2442 kunmap_atomic(kaddr, KM_USER0);
2443 continue;
2444 }
2445 if (buffer_uptodate(&map_bh))
2446 continue; /* reiserfs does this */
2447 if (block_start < from || block_end > to) {
2448 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2449
2450 if (!bh) {
2451 ret = -ENOMEM;
2452 goto failed;
2453 }
2454 bh->b_state = map_bh.b_state;
2455 atomic_set(&bh->b_count, 0);
2456 bh->b_this_page = NULL;
2457 bh->b_page = page;
2458 bh->b_blocknr = map_bh.b_blocknr;
2459 bh->b_size = blocksize;
2460 bh->b_data = (char *)(long)block_start;
2461 bh->b_bdev = map_bh.b_bdev;
2462 bh->b_private = NULL;
2463 read_bh[nr_reads++] = bh;
2464 }
2465 }
2466
2467 if (nr_reads) {
2468 struct buffer_head *bh;
2469
2470 /*
2471 * The page is locked, so these buffers are protected from
2472 * any VM or truncate activity. Hence we don't need to care
2473 * for the buffer_head refcounts.
2474 */
2475 for (i = 0; i < nr_reads; i++) {
2476 bh = read_bh[i];
2477 lock_buffer(bh);
2478 bh->b_end_io = end_buffer_read_nobh;
2479 submit_bh(READ, bh);
2480 }
2481 for (i = 0; i < nr_reads; i++) {
2482 bh = read_bh[i];
2483 wait_on_buffer(bh);
2484 if (!buffer_uptodate(bh))
2485 ret = -EIO;
2486 free_buffer_head(bh);
2487 read_bh[i] = NULL;
2488 }
2489 if (ret)
2490 goto failed;
2491 }
2492
2493 if (is_mapped_to_disk)
2494 SetPageMappedToDisk(page);
2495 SetPageUptodate(page);
2496
2497 /*
2498 * Setting the page dirty here isn't necessary for the prepare_write
2499 * function - commit_write will do that. But if/when this function is
2500 * used within the pagefault handler to ensure that all mmapped pages
2501 * have backing space in the filesystem, we will need to dirty the page
2502 * if its contents were altered.
2503 */
2504 if (dirtied_it)
2505 set_page_dirty(page);
2506
2507 return 0;
2508
2509failed:
2510 for (i = 0; i < nr_reads; i++) {
2511 if (read_bh[i])
2512 free_buffer_head(read_bh[i]);
2513 }
2514
2515 /*
2516 * Error recovery is pretty slack. Clear the page and mark it dirty
2517 * so we'll later zero out any blocks which _were_ allocated.
2518 */
2519 kaddr = kmap_atomic(page, KM_USER0);
2520 memset(kaddr, 0, PAGE_CACHE_SIZE);
2521 kunmap_atomic(kaddr, KM_USER0);
2522 SetPageUptodate(page);
2523 set_page_dirty(page);
2524 return ret;
2525}
2526EXPORT_SYMBOL(nobh_prepare_write);
2527
2528int nobh_commit_write(struct file *file, struct page *page,
2529 unsigned from, unsigned to)
2530{
2531 struct inode *inode = page->mapping->host;
2532 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2533
2534 set_page_dirty(page);
2535 if (pos > inode->i_size) {
2536 i_size_write(inode, pos);
2537 mark_inode_dirty(inode);
2538 }
2539 return 0;
2540}
2541EXPORT_SYMBOL(nobh_commit_write);
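
/*
 * Illustrative sketch: a filesystem that wants pages without attached
 * buffer_heads uses the nobh_* variants together (hypothetical myfs names):
 *
 *	.prepare_write	= myfs_nobh_prepare_write,  (calls nobh_prepare_write())
 *	.commit_write	= nobh_commit_write,
 *	.writepage	= myfs_nobh_writepage,      (calls nobh_writepage())
 *
 * and zeroes the final partial block with nobh_truncate_page().
 */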
2542
2543/*
 2544 * nobh_writepage() - based on block_write_full_page() except
2545 * that it tries to operate without attaching bufferheads to
2546 * the page.
2547 */
2548int nobh_writepage(struct page *page, get_block_t *get_block,
2549 struct writeback_control *wbc)
2550{
2551 struct inode * const inode = page->mapping->host;
2552 loff_t i_size = i_size_read(inode);
2553 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2554 unsigned offset;
2555 void *kaddr;
2556 int ret;
2557
2558 /* Is the page fully inside i_size? */
2559 if (page->index < end_index)
2560 goto out;
2561
2562 /* Is the page fully outside i_size? (truncate in progress) */
2563 offset = i_size & (PAGE_CACHE_SIZE-1);
2564 if (page->index >= end_index+1 || !offset) {
2565 /*
2566 * The page may have dirty, unmapped buffers. For example,
2567 * they may have been added in ext3_writepage(). Make them
2568 * freeable here, so the page does not leak.
2569 */
2570#if 0
2571 /* Not really sure about this - do we need this ? */
2572 if (page->mapping->a_ops->invalidatepage)
2573 page->mapping->a_ops->invalidatepage(page, offset);
2574#endif
2575 unlock_page(page);
2576 return 0; /* don't care */
2577 }
2578
2579 /*
2580 * The page straddles i_size. It must be zeroed out on each and every
2581 * writepage invocation because it may be mmapped. "A file is mapped
2582 * in multiples of the page size. For a file that is not a multiple of
2583 * the page size, the remaining memory is zeroed when mapped, and
2584 * writes to that region are not written out to the file."
2585 */
2586 kaddr = kmap_atomic(page, KM_USER0);
2587 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2588 flush_dcache_page(page);
2589 kunmap_atomic(kaddr, KM_USER0);
2590out:
2591 ret = mpage_writepage(page, get_block, wbc);
2592 if (ret == -EAGAIN)
2593 ret = __block_write_full_page(inode, page, get_block, wbc);
2594 return ret;
2595}
2596EXPORT_SYMBOL(nobh_writepage);
2597
2598/*
2599 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2600 */
2601int nobh_truncate_page(struct address_space *mapping, loff_t from)
2602{
2603 struct inode *inode = mapping->host;
2604 unsigned blocksize = 1 << inode->i_blkbits;
2605 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2606 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2607 unsigned to;
2608 struct page *page;
2609 struct address_space_operations *a_ops = mapping->a_ops;
2610 char *kaddr;
2611 int ret = 0;
2612
2613 if ((offset & (blocksize - 1)) == 0)
2614 goto out;
2615
2616 ret = -ENOMEM;
2617 page = grab_cache_page(mapping, index);
2618 if (!page)
2619 goto out;
2620
2621 to = (offset + blocksize) & ~(blocksize - 1);
2622 ret = a_ops->prepare_write(NULL, page, offset, to);
2623 if (ret == 0) {
2624 kaddr = kmap_atomic(page, KM_USER0);
2625 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2626 flush_dcache_page(page);
2627 kunmap_atomic(kaddr, KM_USER0);
2628 set_page_dirty(page);
2629 }
2630 unlock_page(page);
2631 page_cache_release(page);
2632out:
2633 return ret;
2634}
2635EXPORT_SYMBOL(nobh_truncate_page);
2636
2637int block_truncate_page(struct address_space *mapping,
2638 loff_t from, get_block_t *get_block)
2639{
2640 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2641 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2642 unsigned blocksize;
Andrew Morton54b21a72006-01-08 01:03:05 -08002643 sector_t iblock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002644 unsigned length, pos;
2645 struct inode *inode = mapping->host;
2646 struct page *page;
2647 struct buffer_head *bh;
2648 void *kaddr;
2649 int err;
2650
2651 blocksize = 1 << inode->i_blkbits;
2652 length = offset & (blocksize - 1);
2653
2654 /* Block boundary? Nothing to do */
2655 if (!length)
2656 return 0;
2657
2658 length = blocksize - length;
Andrew Morton54b21a72006-01-08 01:03:05 -08002659 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660
2661 page = grab_cache_page(mapping, index);
2662 err = -ENOMEM;
2663 if (!page)
2664 goto out;
2665
2666 if (!page_has_buffers(page))
2667 create_empty_buffers(page, blocksize, 0);
2668
2669 /* Find the buffer that contains "offset" */
2670 bh = page_buffers(page);
2671 pos = blocksize;
2672 while (offset >= pos) {
2673 bh = bh->b_this_page;
2674 iblock++;
2675 pos += blocksize;
2676 }
2677
2678 err = 0;
2679 if (!buffer_mapped(bh)) {
2680 err = get_block(inode, iblock, bh, 0);
2681 if (err)
2682 goto unlock;
2683 /* unmapped? It's a hole - nothing to do */
2684 if (!buffer_mapped(bh))
2685 goto unlock;
2686 }
2687
2688 /* Ok, it's mapped. Make sure it's up-to-date */
2689 if (PageUptodate(page))
2690 set_buffer_uptodate(bh);
2691
2692 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2693 err = -EIO;
2694 ll_rw_block(READ, 1, &bh);
2695 wait_on_buffer(bh);
2696 /* Uhhuh. Read error. Complain and punt. */
2697 if (!buffer_uptodate(bh))
2698 goto unlock;
2699 }
2700
2701 kaddr = kmap_atomic(page, KM_USER0);
2702 memset(kaddr + offset, 0, length);
2703 flush_dcache_page(page);
2704 kunmap_atomic(kaddr, KM_USER0);
2705
2706 mark_buffer_dirty(bh);
2707 err = 0;
2708
2709unlock:
2710 unlock_page(page);
2711 page_cache_release(page);
2712out:
2713 return err;
2714}
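
/*
 * Illustrative sketch: ->truncate implementations zero the partial block
 * past the new EOF with this helper before trimming the block mapping
 * (hypothetical myfs names):
 *
 *	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);
 *	myfs_free_blocks_beyond(inode, inode->i_size);
 */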
2715
2716/*
2717 * The generic ->writepage function for buffer-backed address_spaces
2718 */
2719int block_write_full_page(struct page *page, get_block_t *get_block,
2720 struct writeback_control *wbc)
2721{
2722 struct inode * const inode = page->mapping->host;
2723 loff_t i_size = i_size_read(inode);
2724 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2725 unsigned offset;
2726 void *kaddr;
2727
2728 /* Is the page fully inside i_size? */
2729 if (page->index < end_index)
2730 return __block_write_full_page(inode, page, get_block, wbc);
2731
2732 /* Is the page fully outside i_size? (truncate in progress) */
2733 offset = i_size & (PAGE_CACHE_SIZE-1);
2734 if (page->index >= end_index+1 || !offset) {
2735 /*
2736 * The page may have dirty, unmapped buffers. For example,
2737 * they may have been added in ext3_writepage(). Make them
2738 * freeable here, so the page does not leak.
2739 */
Jan Karaaaa40592005-10-30 15:00:16 -08002740 do_invalidatepage(page, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002741 unlock_page(page);
2742 return 0; /* don't care */
2743 }
2744
2745 /*
2746 * The page straddles i_size. It must be zeroed out on each and every
 2747 * writepage invocation because it may be mmapped. "A file is mapped
2748 * in multiples of the page size. For a file that is not a multiple of
2749 * the page size, the remaining memory is zeroed when mapped, and
2750 * writes to that region are not written out to the file."
2751 */
2752 kaddr = kmap_atomic(page, KM_USER0);
2753 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2754 flush_dcache_page(page);
2755 kunmap_atomic(kaddr, KM_USER0);
2756 return __block_write_full_page(inode, page, get_block, wbc);
2757}
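
/*
 * Illustrative sketch: the usual ->writepage for a buffer-backed
 * filesystem is a thin wrapper around this (hypothetical myfs names):
 *
 *	static int myfs_writepage(struct page *page,
 *				  struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, myfs_get_block, wbc);
 *	}
 */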
2758
2759sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2760 get_block_t *get_block)
2761{
2762 struct buffer_head tmp;
2763 struct inode *inode = mapping->host;
2764 tmp.b_state = 0;
2765 tmp.b_blocknr = 0;
2766 get_block(inode, block, &tmp, 0);
2767 return tmp.b_blocknr;
2768}
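
/*
 * Illustrative sketch: the matching ->bmap address_space operation is
 * normally just (hypothetical myfs names):
 *
 *	static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, myfs_get_block);
 *	}
 */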
2769
2770static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2771{
2772 struct buffer_head *bh = bio->bi_private;
2773
2774 if (bio->bi_size)
2775 return 1;
2776
2777 if (err == -EOPNOTSUPP) {
2778 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2779 set_bit(BH_Eopnotsupp, &bh->b_state);
2780 }
2781
2782 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2783 bio_put(bio);
2784 return 0;
2785}
2786
2787int submit_bh(int rw, struct buffer_head * bh)
2788{
2789 struct bio *bio;
2790 int ret = 0;
2791
2792 BUG_ON(!buffer_locked(bh));
2793 BUG_ON(!buffer_mapped(bh));
2794 BUG_ON(!bh->b_end_io);
2795
2796 if (buffer_ordered(bh) && (rw == WRITE))
2797 rw = WRITE_BARRIER;
2798
2799 /*
2800 * Only clear out a write error when rewriting, should this
2801 * include WRITE_SYNC as well?
2802 */
2803 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2804 clear_buffer_write_io_error(bh);
2805
2806 /*
2807 * from here on down, it's all bio -- do the initial mapping,
2808 * submit_bio -> generic_make_request may further map this bio around
2809 */
2810 bio = bio_alloc(GFP_NOIO, 1);
2811
2812 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2813 bio->bi_bdev = bh->b_bdev;
2814 bio->bi_io_vec[0].bv_page = bh->b_page;
2815 bio->bi_io_vec[0].bv_len = bh->b_size;
2816 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2817
2818 bio->bi_vcnt = 1;
2819 bio->bi_idx = 0;
2820 bio->bi_size = bh->b_size;
2821
2822 bio->bi_end_io = end_bio_bh_io_sync;
2823 bio->bi_private = bh;
2824
2825 bio_get(bio);
2826 submit_bio(rw, bio);
2827
2828 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2829 ret = -EOPNOTSUPP;
2830
2831 bio_put(bio);
2832 return ret;
2833}
2834
2835/**
2836 * ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Karaa7662232005-09-06 15:19:10 -07002837 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002838 * @nr: number of &struct buffer_heads in the array
2839 * @bhs: array of pointers to &struct buffer_head
2840 *
Jan Karaa7662232005-09-06 15:19:10 -07002841 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2842 * requests an I/O operation on them, either a %READ or a %WRITE. The third
 2843 * option, %SWRITE, is like %WRITE except that we make sure the *current* data
 2844 * in the buffers is sent to disk. The fourth option, %READA, is described in
 2845 * the documentation for generic_make_request(), which ll_rw_block() calls.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846 *
2847 * This function drops any buffer that it cannot get a lock on (with the
Jan Karaa7662232005-09-06 15:19:10 -07002848 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2849 * clean when doing a write request, and any buffer that appears to be
 2850 * up-to-date when doing a read request. Further, it marks as clean the buffers that
2851 * are processed for writing (the buffer cache won't assume that they are
2852 * actually clean until the buffer gets unlocked).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002853 *
 2854 * ll_rw_block sets b_end_io to a simple completion handler that marks
 2855 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2856 * any waiters.
2857 *
2858 * All of the buffers must be for the same device, and must also be a
2859 * multiple of the current approved size for the device.
2860 */
2861void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2862{
2863 int i;
2864
2865 for (i = 0; i < nr; i++) {
2866 struct buffer_head *bh = bhs[i];
2867
Jan Karaa7662232005-09-06 15:19:10 -07002868 if (rw == SWRITE)
2869 lock_buffer(bh);
2870 else if (test_set_buffer_locked(bh))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002871 continue;
2872
Jan Karaa7662232005-09-06 15:19:10 -07002873 if (rw == WRITE || rw == SWRITE) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002874 if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org76c30732005-04-16 15:24:07 -07002875 bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumie60e5c52006-02-03 03:04:43 -08002876 get_bh(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002877 submit_bh(WRITE, bh);
2878 continue;
2879 }
2880 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002881 if (!buffer_uptodate(bh)) {
akpm@osdl.org76c30732005-04-16 15:24:07 -07002882 bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumie60e5c52006-02-03 03:04:43 -08002883 get_bh(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002884 submit_bh(rw, bh);
2885 continue;
2886 }
2887 }
2888 unlock_buffer(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002889 }
2890}
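
/*
 * Illustrative sketch of the classic (now discouraged) calling pattern:
 * start reads on a batch of buffers, then wait for each one and check
 * that it came back uptodate:
 *
 *	ll_rw_block(READ, nr, bhs);
 *	for (i = 0; i < nr; i++) {
 *		wait_on_buffer(bhs[i]);
 *		if (!buffer_uptodate(bhs[i]))
 *			err = -EIO;
 *	}
 */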
2891
2892/*
2893 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2894 * and then start new I/O and then wait upon it. The caller must have a ref on
2895 * the buffer_head.
2896 */
2897int sync_dirty_buffer(struct buffer_head *bh)
2898{
2899 int ret = 0;
2900
2901 WARN_ON(atomic_read(&bh->b_count) < 1);
2902 lock_buffer(bh);
2903 if (test_clear_buffer_dirty(bh)) {
2904 get_bh(bh);
2905 bh->b_end_io = end_buffer_write_sync;
2906 ret = submit_bh(WRITE, bh);
2907 wait_on_buffer(bh);
2908 if (buffer_eopnotsupp(bh)) {
2909 clear_buffer_eopnotsupp(bh);
2910 ret = -EOPNOTSUPP;
2911 }
2912 if (!ret && !buffer_uptodate(bh))
2913 ret = -EIO;
2914 } else {
2915 unlock_buffer(bh);
2916 }
2917 return ret;
2918}
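
/*
 * Illustrative sketch: a metadata update that must reach disk before the
 * caller proceeds typically looks like this (the caller already holds a
 * reference on bh, as required above):
 *
 *	memcpy(bh->b_data + offset, data, len);
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);
 */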
2919
2920/*
2921 * try_to_free_buffers() checks if all the buffers on this particular page
2922 * are unused, and releases them if so.
2923 *
2924 * Exclusion against try_to_free_buffers may be obtained by either
2925 * locking the page or by holding its mapping's private_lock.
2926 *
2927 * If the page is dirty but all the buffers are clean then we need to
2928 * be sure to mark the page clean as well. This is because the page
2929 * may be against a block device, and a later reattachment of buffers
2930 * to a dirty page will set *all* buffers dirty. Which would corrupt
2931 * filesystem data on the same device.
2932 *
2933 * The same applies to regular filesystem pages: if all the buffers are
2934 * clean then we set the page clean and proceed. To do that, we require
2935 * total exclusion from __set_page_dirty_buffers(). That is obtained with
2936 * private_lock.
2937 *
2938 * try_to_free_buffers() is non-blocking.
2939 */
2940static inline int buffer_busy(struct buffer_head *bh)
2941{
2942 return atomic_read(&bh->b_count) |
2943 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2944}
2945
2946static int
2947drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2948{
2949 struct buffer_head *head = page_buffers(page);
2950 struct buffer_head *bh;
2951
2952 bh = head;
2953 do {
akpm@osdl.orgde7d5a32005-05-01 08:58:39 -07002954 if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002955 set_bit(AS_EIO, &page->mapping->flags);
2956 if (buffer_busy(bh))
2957 goto failed;
2958 bh = bh->b_this_page;
2959 } while (bh != head);
2960
2961 do {
2962 struct buffer_head *next = bh->b_this_page;
2963
2964 if (!list_empty(&bh->b_assoc_buffers))
2965 __remove_assoc_queue(bh);
2966 bh = next;
2967 } while (bh != head);
2968 *buffers_to_free = head;
2969 __clear_page_buffers(page);
2970 return 1;
2971failed:
2972 return 0;
2973}
2974
2975int try_to_free_buffers(struct page *page)
2976{
2977 struct address_space * const mapping = page->mapping;
2978 struct buffer_head *buffers_to_free = NULL;
2979 int ret = 0;
2980
2981 BUG_ON(!PageLocked(page));
2982 if (PageWriteback(page))
2983 return 0;
2984
2985 if (mapping == NULL) { /* can this still happen? */
2986 ret = drop_buffers(page, &buffers_to_free);
2987 goto out;
2988 }
2989
2990 spin_lock(&mapping->private_lock);
2991 ret = drop_buffers(page, &buffers_to_free);
2992 if (ret) {
2993 /*
2994 * If the filesystem writes its buffers by hand (eg ext3)
2995 * then we can have clean buffers against a dirty page. We
2996 * clean the page here; otherwise later reattachment of buffers
2997 * could encounter a non-uptodate page, which is unresolvable.
2998 * This only applies in the rare case where try_to_free_buffers
2999 * succeeds but the page is not freed.
3000 */
3001 clear_page_dirty(page);
3002 }
3003 spin_unlock(&mapping->private_lock);
3004out:
3005 if (buffers_to_free) {
3006 struct buffer_head *bh = buffers_to_free;
3007
3008 do {
3009 struct buffer_head *next = bh->b_this_page;
3010 free_buffer_head(bh);
3011 bh = next;
3012 } while (bh != buffers_to_free);
3013 }
3014 return ret;
3015}
3016EXPORT_SYMBOL(try_to_free_buffers);
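
/*
 * Illustrative sketch: filesystems that need no extra work expose this
 * directly, or via a thin wrapper, as their ->releasepage operation
 * (hypothetical myfs name):
 *
 *	static int myfs_releasepage(struct page *page, gfp_t gfp)
 *	{
 *		return try_to_free_buffers(page);
 *	}
 */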
3017
3018int block_sync_page(struct page *page)
3019{
3020 struct address_space *mapping;
3021
3022 smp_mb();
3023 mapping = page_mapping(page);
3024 if (mapping)
3025 blk_run_backing_dev(mapping->backing_dev_info, page);
3026 return 0;
3027}
3028
3029/*
3030 * There are no bdflush tunables left. But distributions are
3031 * still running obsolete flush daemons, so we terminate them here.
3032 *
3033 * Use of bdflush() is deprecated and will be removed in a future kernel.
3034 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3035 */
3036asmlinkage long sys_bdflush(int func, long data)
3037{
3038 static int msg_count;
3039
3040 if (!capable(CAP_SYS_ADMIN))
3041 return -EPERM;
3042
3043 if (msg_count < 5) {
3044 msg_count++;
3045 printk(KERN_INFO
3046 "warning: process `%s' used the obsolete bdflush"
3047 " system call\n", current->comm);
3048 printk(KERN_INFO "Fix your initscripts?\n");
3049 }
3050
3051 if (func == 1)
3052 do_exit(0);
3053 return 0;
3054}
3055
3056/*
3057 * Buffer-head allocation
3058 */
3059static kmem_cache_t *bh_cachep;
3060
3061/*
3062 * Once the number of bh's in the machine exceeds this level, we start
3063 * stripping them in writeback.
3064 */
3065static int max_buffer_heads;
3066
3067int buffer_heads_over_limit;
3068
3069struct bh_accounting {
3070 int nr; /* Number of live bh's */
3071 int ratelimit; /* Limit cacheline bouncing */
3072};
3073
3074static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3075
3076static void recalc_bh_state(void)
3077{
3078 int i;
3079 int tot = 0;
3080
3081 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3082 return;
3083 __get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet8a143422006-03-24 03:18:10 -08003084 for_each_online_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003085 tot += per_cpu(bh_accounting, i).nr;
3086 buffer_heads_over_limit = (tot > max_buffer_heads);
3087}
3088
Al Virodd0fc662005-10-07 07:46:04 +01003089struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003090{
3091 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3092 if (ret) {
Coywolf Qi Hunt736c7b82005-09-06 15:18:17 -07003093 get_cpu_var(bh_accounting).nr++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003094 recalc_bh_state();
Coywolf Qi Hunt736c7b82005-09-06 15:18:17 -07003095 put_cpu_var(bh_accounting);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003096 }
3097 return ret;
3098}
3099EXPORT_SYMBOL(alloc_buffer_head);
3100
3101void free_buffer_head(struct buffer_head *bh)
3102{
3103 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3104 kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt736c7b82005-09-06 15:18:17 -07003105 get_cpu_var(bh_accounting).nr--;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003106 recalc_bh_state();
Coywolf Qi Hunt736c7b82005-09-06 15:18:17 -07003107 put_cpu_var(bh_accounting);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003108}
3109EXPORT_SYMBOL(free_buffer_head);
3110
3111static void
3112init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3113{
3114 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3115 SLAB_CTOR_CONSTRUCTOR) {
3116 struct buffer_head * bh = (struct buffer_head *)data;
3117
3118 memset(bh, 0, sizeof(*bh));
3119 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3120 }
3121}
3122
3123#ifdef CONFIG_HOTPLUG_CPU
3124static void buffer_exit_cpu(int cpu)
3125{
3126 int i;
3127 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3128
3129 for (i = 0; i < BH_LRU_SIZE; i++) {
3130 brelse(b->bhs[i]);
3131 b->bhs[i] = NULL;
3132 }
Eric Dumazet8a143422006-03-24 03:18:10 -08003133 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3134 per_cpu(bh_accounting, cpu).nr = 0;
3135 put_cpu_var(bh_accounting);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003136}
3137
3138static int buffer_cpu_notify(struct notifier_block *self,
3139 unsigned long action, void *hcpu)
3140{
3141 if (action == CPU_DEAD)
3142 buffer_exit_cpu((unsigned long)hcpu);
3143 return NOTIFY_OK;
3144}
3145#endif /* CONFIG_HOTPLUG_CPU */
3146
3147void __init buffer_init(void)
3148{
3149 int nrpages;
3150
3151 bh_cachep = kmem_cache_create("buffer_head",
Paul Jacksonb0196002006-03-24 03:16:09 -08003152 sizeof(struct buffer_head), 0,
3153 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3154 SLAB_MEM_SPREAD),
3155 init_buffer_head,
3156 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003157
3158 /*
3159 * Limit the bh occupancy to 10% of ZONE_NORMAL
3160 */
3161 nrpages = (nr_free_buffer_pages() * 10) / 100;
3162 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3163 hotcpu_notifier(buffer_cpu_notify, 0);
3164}
3165
3166EXPORT_SYMBOL(__bforget);
3167EXPORT_SYMBOL(__brelse);
3168EXPORT_SYMBOL(__wait_on_buffer);
3169EXPORT_SYMBOL(block_commit_write);
3170EXPORT_SYMBOL(block_prepare_write);
3171EXPORT_SYMBOL(block_read_full_page);
3172EXPORT_SYMBOL(block_sync_page);
3173EXPORT_SYMBOL(block_truncate_page);
3174EXPORT_SYMBOL(block_write_full_page);
3175EXPORT_SYMBOL(cont_prepare_write);
3176EXPORT_SYMBOL(end_buffer_async_write);
3177EXPORT_SYMBOL(end_buffer_read_sync);
3178EXPORT_SYMBOL(end_buffer_write_sync);
3179EXPORT_SYMBOL(file_fsync);
3180EXPORT_SYMBOL(fsync_bdev);
3181EXPORT_SYMBOL(generic_block_bmap);
3182EXPORT_SYMBOL(generic_commit_write);
3183EXPORT_SYMBOL(generic_cont_expand);
OGAWA Hirofumi05eb0b52006-01-08 01:02:13 -08003184EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185EXPORT_SYMBOL(init_buffer);
3186EXPORT_SYMBOL(invalidate_bdev);
3187EXPORT_SYMBOL(ll_rw_block);
3188EXPORT_SYMBOL(mark_buffer_dirty);
3189EXPORT_SYMBOL(submit_bh);
3190EXPORT_SYMBOL(sync_dirty_buffer);
3191EXPORT_SYMBOL(unlock_buffer);