Blame - fs/buffer.c - kernel/msm-4.9

blob: 71649ef9b6586696c695e2d80c000ceedaa51282 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required on older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
				27	#include <linux/smp_lock.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	28	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	29	#include <linux/blkdev.h>
				30	#include <linux/file.h>
				31	#include <linux/quotaops.h>
				32	#include <linux/highmem.h>
				33	#include <linux/module.h>
				34	#include <linux/writeback.h>
				35	#include <linux/hash.h>
				36	#include <linux/suspend.h>
				37	#include <linux/buffer_head.h>
				38	#include <linux/bio.h>
				39	#include <linux/notifier.h>
				40	#include <linux/cpu.h>
				41	#include <linux/bitops.h>
				42	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	43	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	44
				45	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
				46	static void invalidate_bh_lrus(void);
				47
				48	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				49
				50	inline void
				51	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				52	{
				53	bh->b_end_io = handler;
				54	bh->b_private = private;
				55	}
				56
				57	static int sync_buffer(void *word)
				58	{
				59	struct block_device *bd;
				60	struct buffer_head *bh
				61	= container_of(word, struct buffer_head, b_state);
				62
				63	smp_mb();
				64	bd = bh->b_bdev;
				65	if (bd)
				66	blk_run_address_space(bd->bd_inode->i_mapping);
				67	io_schedule();
				68	return 0;
				69	}
				70
				71	void fastcall __lock_buffer(struct buffer_head *bh)
				72	{
				73	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				74	TASK_UNINTERRUPTIBLE);
				75	}
				76	EXPORT_SYMBOL(__lock_buffer);
				77
				78	void fastcall unlock_buffer(struct buffer_head *bh)
				79	{
				80	clear_buffer_locked(bh);
				81	smp_mb__after_clear_bit();
				82	wake_up_bit(&bh->b_state, BH_Lock);
				83	}
				84
				85	/*
				86	* Block until a buffer comes unlocked. This doesn't stop it
				87	* from becoming locked again - you have to lock it yourself
				88	* if you want to preserve its state.
				89	*/
				90	void __wait_on_buffer(struct buffer_head * bh)
				91	{
				92	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				93	}
				94
				95	static void
				96	__clear_page_buffers(struct page *page)
				97	{
				98	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	99	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	100	page_cache_release(page);
				101	}
				102
				103	static void buffer_io_error(struct buffer_head *bh)
				104	{
				105	char b[BDEVNAME_SIZE];
				106
				107	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				108	bdevname(bh->b_bdev, b),
				109	(unsigned long long)bh->b_blocknr);
				110	}
				111
				112	/*
				113	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				114	* unlock the buffer. This is what ll_rw_block uses too.
				115	*/
				116	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				117	{
				118	if (uptodate) {
				119	set_buffer_uptodate(bh);
				120	} else {
				121	/* This happens, due to failed READA attempts. */
				122	clear_buffer_uptodate(bh);
				123	}
				124	unlock_buffer(bh);
				125	put_bh(bh);
				126	}
				127
				128	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				129	{
				130	char b[BDEVNAME_SIZE];
				131
				132	if (uptodate) {
				133	set_buffer_uptodate(bh);
				134	} else {
				135	if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
				136	buffer_io_error(bh);
				137	printk(KERN_WARNING "lost page write due to "
				138	"I/O error on %s\n",
				139	bdevname(bh->b_bdev, b));
				140	}
				141	set_buffer_write_io_error(bh);
				142	clear_buffer_uptodate(bh);
				143	}
				144	unlock_buffer(bh);
				145	put_bh(bh);
				146	}
				147
				148	/*
				149	* Write out and wait upon all the dirty data associated with a block
				150	* device via its mapping. Does not take the superblock lock.
				151	*/
				152	int sync_blockdev(struct block_device *bdev)
				153	{
				154	int ret = 0;
				155
OGAWA Hirofumi	28fd129	2006-01-08 01:02:14 -0800	[diff] [blame]	156	if (bdev)
				157	ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	158	return ret;
				159	}
				160	EXPORT_SYMBOL(sync_blockdev);
				161
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	162	static void __fsync_super(struct super_block *sb)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	163	{
				164	sync_inodes_sb(sb, 0);
				165	DQUOT_SYNC(sb);
				166	lock_super(sb);
				167	if (sb->s_dirt && sb->s_op->write_super)
				168	sb->s_op->write_super(sb);
				169	unlock_super(sb);
				170	if (sb->s_op->sync_fs)
				171	sb->s_op->sync_fs(sb, 1);
				172	sync_blockdev(sb->s_bdev);
				173	sync_inodes_sb(sb, 1);
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	174	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	175
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	176	/*
				177	* Write out and wait upon all dirty data associated with this
				178	* superblock. Filesystem data as well as the underlying block
				179	* device. Takes the superblock lock.
				180	*/
				181	int fsync_super(struct super_block *sb)
				182	{
				183	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	184	return sync_blockdev(sb->s_bdev);
				185	}
				186
				187	/*
				188	* Write out and wait upon all dirty data associated with this
				189	* device. Filesystem data as well as the underlying block
				190	* device. Takes the superblock lock.
				191	*/
				192	int fsync_bdev(struct block_device *bdev)
				193	{
				194	struct super_block *sb = get_super(bdev);
				195	if (sb) {
				196	int res = fsync_super(sb);
				197	drop_super(sb);
				198	return res;
				199	}
				200	return sync_blockdev(bdev);
				201	}
				202
				203	/**
				204	* freeze_bdev -- lock a filesystem and force it into a consistent state
				205	* @bdev: blockdevice to lock
				206	*
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	207	* This takes the block device bd_mount_mutex to make sure no new mounts
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	208	* happen on bdev until thaw_bdev() is called.
				209	* If a superblock is found on this device, we take the s_umount semaphore
				210	* on it to make sure nobody unmounts until the snapshot creation is done.
				211	*/
				212	struct super_block freeze_bdev(struct block_device bdev)
				213	{
				214	struct super_block *sb;
				215
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	216	mutex_lock(&bdev->bd_mount_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	217	sb = get_super(bdev);
				218	if (sb && !(sb->s_flags & MS_RDONLY)) {
				219	sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	220	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	221
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	222	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	223
				224	sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	225	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	226
				227	sync_blockdev(sb->s_bdev);
				228
				229	if (sb->s_op->write_super_lockfs)
				230	sb->s_op->write_super_lockfs(sb);
				231	}
				232
				233	sync_blockdev(bdev);
				234	return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
				235	}
				236	EXPORT_SYMBOL(freeze_bdev);
				237
				238	/**
				239	* thaw_bdev -- unlock filesystem
				240	* @bdev: blockdevice to unlock
				241	* @sb: associated superblock
				242	*
				243	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
				244	*/
				245	void thaw_bdev(struct block_device bdev, struct super_block sb)
				246	{
				247	if (sb) {
				248	BUG_ON(sb->s_bdev != bdev);
				249
				250	if (sb->s_op->unlockfs)
				251	sb->s_op->unlockfs(sb);
				252	sb->s_frozen = SB_UNFROZEN;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	253	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	254	wake_up(&sb->s_wait_unfrozen);
				255	drop_super(sb);
				256	}
				257
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	258	mutex_unlock(&bdev->bd_mount_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	259	}
				260	EXPORT_SYMBOL(thaw_bdev);
				261
				262	/*
				263	* sync everything. Start out by waking pdflush, because that writes back
				264	* all queues in parallel.
				265	*/
				266	static void do_sync(unsigned long wait)
				267	{
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	268	wakeup_pdflush(0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	269	sync_inodes(0); /* All mappings, inodes and their blockdevs */
				270	DQUOT_SYNC(NULL);
				271	sync_supers(); /* Write the superblocks */
				272	sync_filesystems(0); /* Start syncing the filesystems */
				273	sync_filesystems(wait); /* Waitingly sync the filesystems */
				274	sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
				275	if (!wait)
				276	printk("Emergency Sync complete\n");
				277	if (unlikely(laptop_mode))
				278	laptop_sync_completion();
				279	}
				280
				281	asmlinkage long sys_sync(void)
				282	{
				283	do_sync(1);
				284	return 0;
				285	}
				286
				287	void emergency_sync(void)
				288	{
				289	pdflush_operation(do_sync, 0);
				290	}
				291
				292	/*
				293	* Generic function to fsync a file.
				294	*
				295	* filp may be NULL if called via the msync of a vma.
				296	*/
				297
				298	int file_fsync(struct file filp, struct dentry dentry, int datasync)
				299	{
				300	struct inode * inode = dentry->d_inode;
				301	struct super_block * sb;
				302	int ret, err;
				303
				304	/* sync the inode to buffers */
				305	ret = write_inode_now(inode, 0);
				306
				307	/* sync the superblock to buffers */
				308	sb = inode->i_sb;
				309	lock_super(sb);
				310	if (sb->s_op->write_super)
				311	sb->s_op->write_super(sb);
				312	unlock_super(sb);
				313
				314	/* .. finally sync the buffers to disk */
				315	err = sync_blockdev(sb->s_bdev);
				316	if (!ret)
				317	ret = err;
				318	return ret;
				319	}
				320
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	321	long do_fsync(struct file *file, int datasync)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	322	{
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	323	int ret;
				324	int err;
				325	struct address_space *mapping = file->f_mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	326
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	327	if (!file->f_op \|\| !file->f_op->fsync) {
				328	/* Why? We can still call filemap_fdatawrite */
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	329	ret = -EINVAL;
				330	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	331	}
				332
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	333	ret = filemap_fdatawrite(mapping);
				334
				335	/*
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	336	* We need to protect against concurrent writers, which could cause
				337	* livelocks in fsync_buffers_list().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	338	*/
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	339	mutex_lock(&mapping->host->i_mutex);
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	340	err = file->f_op->fsync(file, file->f_dentry, datasync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	341	if (!ret)
				342	ret = err;
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	343	mutex_unlock(&mapping->host->i_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	344	err = filemap_fdatawait(mapping);
				345	if (!ret)
				346	ret = err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	347	out:
				348	return ret;
				349	}
				350
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	351	static long __do_fsync(unsigned int fd, int datasync)
				352	{
				353	struct file *file;
				354	int ret = -EBADF;
				355
				356	file = fget(fd);
				357	if (file) {
				358	ret = do_fsync(file, datasync);
				359	fput(file);
				360	}
				361	return ret;
				362	}
				363
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	364	asmlinkage long sys_fsync(unsigned int fd)
				365	{
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	366	return __do_fsync(fd, 0);
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	367	}
				368
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	369	asmlinkage long sys_fdatasync(unsigned int fd)
				370	{
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	371	return __do_fsync(fd, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	372	}
				373
				374	/*
				375	* Various filesystems appear to want __find_get_block to be non-blocking.
				376	* But it's the page lock which protects the buffers. To get around this,
				377	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				378	* private_lock.
				379	*
				380	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				381	* may be quite high. This code could TryLock the page, and if that
				382	* succeeds, there is no need to take private_lock. (But if
				383	* private_lock is contended then so is mapping->tree_lock).
				384	*/
				385	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	386	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	387	{
				388	struct inode *bd_inode = bdev->bd_inode;
				389	struct address_space *bd_mapping = bd_inode->i_mapping;
				390	struct buffer_head *ret = NULL;
				391	pgoff_t index;
				392	struct buffer_head *bh;
				393	struct buffer_head *head;
				394	struct page *page;
				395	int all_mapped = 1;
				396
				397	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				398	page = find_get_page(bd_mapping, index);
				399	if (!page)
				400	goto out;
				401
				402	spin_lock(&bd_mapping->private_lock);
				403	if (!page_has_buffers(page))
				404	goto out_unlock;
				405	head = page_buffers(page);
				406	bh = head;
				407	do {
				408	if (bh->b_blocknr == block) {
				409	ret = bh;
				410	get_bh(bh);
				411	goto out_unlock;
				412	}
				413	if (!buffer_mapped(bh))
				414	all_mapped = 0;
				415	bh = bh->b_this_page;
				416	} while (bh != head);
				417
				418	/* we might be here because some of the buffers on this page are
				419	* not mapped. This is due to various races between
				420	* file io on the block device and getblk. It gets dealt with
				421	* elsewhere, don't buffer_error if we had some unmapped buffers
				422	*/
				423	if (all_mapped) {
				424	printk("__find_get_block_slow() failed. "
				425	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	426	(unsigned long long)block,
				427	(unsigned long long)bh->b_blocknr);
				428	printk("b_state=0x%08lx, b_size=%zu\n",
				429	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	430	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				431	}
				432	out_unlock:
				433	spin_unlock(&bd_mapping->private_lock);
				434	page_cache_release(page);
				435	out:
				436	return ret;
				437	}
				438
				439	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				440	of fs corruption is going on. Trashing dirty data always imply losing
				441	information that was supposed to be just stored on the physical layer
				442	by the user.
				443
				444	Thus invalidate_buffers in general usage is not allwowed to trash
				445	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				446	be preserved. These buffers are simply skipped.
				447
				448	We also skip buffers which are still in use. For example this can
				449	happen if a userspace program is reading the block device.
				450
				451	NOTE: In the case where the user removed a removable-media-disk even if
				452	there's still dirty data not synced on disk (due a bug in the device driver
				453	or due an error of the user), by not destroying the dirty buffers we could
				454	generate corruption also on the next media inserted, thus a parameter is
				455	necessary to handle this case in the most safe way possible (trying
				456	to not corrupt also the new disk inserted with the data belonging to
				457	the old now corrupted disk). Also for the ramdisk the natural thing
				458	to do in order to release the ramdisk memory is to destroy dirty buffers.
				459
				460	These are two special cases. Normal usage imply the device driver
				461	to issue a sync on the device (without waiting I/O completion) and
				462	then an invalidate_buffers call that doesn't trash dirty buffers.
				463
				464	For handling cache coherency with the blkdev pagecache the 'update' case
				465	is been introduced. It is needed to re-read from disk any pinned
				466	buffer. NOTE: re-reading from disk is destructive so we can do it only
				467	when we assume nobody is changing the buffercache under our I/O and when
				468	we think the disk contains more recent information than the buffercache.
				469	The update == 1 pass marks the buffers we need to update, the update == 2
				470	pass does the actual I/O. */
				471	void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
				472	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	473	struct address_space *mapping = bdev->bd_inode->i_mapping;
				474
				475	if (mapping->nrpages == 0)
				476	return;
				477
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	478	invalidate_bh_lrus();
				479	/*
				480	* FIXME: what about destroy_dirty_buffers?
				481	* We really want to use invalidate_inode_pages2() for
				482	* that, but not until that's cleaned up.
				483	*/
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	484	invalidate_inode_pages(mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	485	}
				486
				487	/*
				488	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				489	*/
				490	static void free_more_memory(void)
				491	{
				492	struct zone **zones;
				493	pg_data_t *pgdat;
				494
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	495	wakeup_pdflush(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	496	yield();
				497
KAMEZAWA Hiroyuki	ec936fc	2006-03-27 01:15:59 -0800	[diff] [blame]	498	for_each_online_pgdat(pgdat) {
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	499	zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	500	if (*zones)
Darren Hart	1ad539b	2005-06-21 17:14:53 -0700	[diff] [blame]	501	try_to_free_pages(zones, GFP_NOFS);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	502	}
				503	}
				504
				505	/*
				506	* I/O completion handler for block_read_full_page() - pages
				507	* which come unlocked at the end of I/O.
				508	*/
				509	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				510	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	511	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	512	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	513	struct buffer_head *tmp;
				514	struct page *page;
				515	int page_uptodate = 1;
				516
				517	BUG_ON(!buffer_async_read(bh));
				518
				519	page = bh->b_page;
				520	if (uptodate) {
				521	set_buffer_uptodate(bh);
				522	} else {
				523	clear_buffer_uptodate(bh);
				524	if (printk_ratelimit())
				525	buffer_io_error(bh);
				526	SetPageError(page);
				527	}
				528
				529	/*
				530	* Be _very_ careful from here on. Bad things can happen if
				531	* two buffer heads end IO at almost the same time and both
				532	* decide that the page is now completely done.
				533	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	534	first = page_buffers(page);
				535	local_irq_save(flags);
				536	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	537	clear_buffer_async_read(bh);
				538	unlock_buffer(bh);
				539	tmp = bh;
				540	do {
				541	if (!buffer_uptodate(tmp))
				542	page_uptodate = 0;
				543	if (buffer_async_read(tmp)) {
				544	BUG_ON(!buffer_locked(tmp));
				545	goto still_busy;
				546	}
				547	tmp = tmp->b_this_page;
				548	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	549	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				550	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	551
				552	/*
				553	* If none of the buffers had errors and they are all
				554	* uptodate then we can set the page uptodate.
				555	*/
				556	if (page_uptodate && !PageError(page))
				557	SetPageUptodate(page);
				558	unlock_page(page);
				559	return;
				560
				561	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	562	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				563	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	564	return;
				565	}
				566
				567	/*
				568	* Completion handler for block_write_full_page() - pages which are unlocked
				569	* during I/O, and which have PageWriteback cleared upon I/O completion.
				570	*/
Adrian Bunk	b6cd0b7	2006-06-27 02:53:54 -0700	[diff] [blame]	571	static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	572	{
				573	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	574	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	575	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	576	struct buffer_head *tmp;
				577	struct page *page;
				578
				579	BUG_ON(!buffer_async_write(bh));
				580
				581	page = bh->b_page;
				582	if (uptodate) {
				583	set_buffer_uptodate(bh);
				584	} else {
				585	if (printk_ratelimit()) {
				586	buffer_io_error(bh);
				587	printk(KERN_WARNING "lost page write due to "
				588	"I/O error on %s\n",
				589	bdevname(bh->b_bdev, b));
				590	}
				591	set_bit(AS_EIO, &page->mapping->flags);
				592	clear_buffer_uptodate(bh);
				593	SetPageError(page);
				594	}
				595
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	596	first = page_buffers(page);
				597	local_irq_save(flags);
				598	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				599
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	600	clear_buffer_async_write(bh);
				601	unlock_buffer(bh);
				602	tmp = bh->b_this_page;
				603	while (tmp != bh) {
				604	if (buffer_async_write(tmp)) {
				605	BUG_ON(!buffer_locked(tmp));
				606	goto still_busy;
				607	}
				608	tmp = tmp->b_this_page;
				609	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	610	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				611	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	612	end_page_writeback(page);
				613	return;
				614
				615	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	616	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				617	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	618	return;
				619	}
				620
				621	/*
				622	* If a page's buffers are under async readin (end_buffer_async_read
				623	* completion) then there is a possibility that another thread of
				624	* control could lock one of the buffers after it has completed
				625	* but while some of the other buffers have not completed. This
				626	* locked buffer would confuse end_buffer_async_read() into not unlocking
				627	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				628	* that this buffer is not under async I/O.
				629	*
				630	* The page comes unlocked when it has no locked buffer_async buffers
				631	* left.
				632	*
				633	* PageLocked prevents anyone starting new async I/O reads any of
				634	* the buffers.
				635	*
				636	* PageWriteback is used to prevent simultaneous writeout of the same
				637	* page.
				638	*
				639	* PageLocked prevents anyone from starting writeback of a page which is
				640	* under read I/O (PageWriteback is only ever set against a locked page).
				641	*/
				642	static void mark_buffer_async_read(struct buffer_head *bh)
				643	{
				644	bh->b_end_io = end_buffer_async_read;
				645	set_buffer_async_read(bh);
				646	}
				647
				648	void mark_buffer_async_write(struct buffer_head *bh)
				649	{
				650	bh->b_end_io = end_buffer_async_write;
				651	set_buffer_async_write(bh);
				652	}
				653	EXPORT_SYMBOL(mark_buffer_async_write);
				654
				655
				656	/*
				657	* fs/buffer.c contains helper functions for buffer-backed address space's
				658	* fsync functions. A common requirement for buffer-based filesystems is
				659	* that certain data from the backing blockdev needs to be written out for
				660	* a successful fsync(). For example, ext2 indirect blocks need to be
				661	* written back and waited upon before fsync() returns.
				662	*
				663	* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
				664	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				665	* management of a list of dependent buffers at ->i_mapping->private_list.
				666	*
				667	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				668	* from their controlling inode's queue when they are being freed. But
				669	* try_to_free_buffers() will be operating against the blockdev mapping
				670	* at the time, not against the S_ISREG file which depends on those buffers.
				671	* So the locking for private_list is via the private_lock in the address_space
				672	* which backs the buffers. Which is different from the address_space
				673	* against which the buffers are listed. So for a particular address_space,
				674	* mapping->private_lock does not protect mapping->private_list! In fact,
				675	* mapping->private_list will always be protected by the backing blockdev's
				676	* ->private_lock.
				677	*
				678	* Which introduces a requirement: all buffers on an address_space's
				679	* ->private_list must be from the same address_space: the blockdev's.
				680	*
				681	* address_spaces which do not place buffers at ->private_list via these
				682	* utility functions are free to use private_lock and private_list for
				683	* whatever they want. The only requirement is that list_empty(private_list)
				684	* be true at clear_inode() time.
				685	*
				686	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				687	* filesystems should do that. invalidate_inode_buffers() should just go
				688	* BUG_ON(!list_empty).
				689	*
				690	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				691	* take an address_space, not an inode. And it should be called
				692	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				693	* queued up.
				694	*
				695	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				696	* list if it is already on a list. Because if the buffer is on a list,
				697	* it must already be on the right one. If not, the filesystem is being
				698	* silly. This will save a ton of locking. But first we have to ensure
				699	* that buffers are taken off the old inode's list when they are freed
				700	* (presumably in truncate). That requires careful auditing of all
				701	* filesystems (do it inside bforget()). It could also be done by bringing
				702	* b_inode back.
				703	*/
				704
				705	/*
				706	* The buffer's backing address_space's private_lock must be held
				707	*/
				708	static inline void __remove_assoc_queue(struct buffer_head *bh)
				709	{
				710	list_del_init(&bh->b_assoc_buffers);
				711	}
				712
				713	int inode_has_buffers(struct inode *inode)
				714	{
				715	return !list_empty(&inode->i_data.private_list);
				716	}
				717
				718	/*
				719	* osync is designed to support O_SYNC io. It waits synchronously for
				720	* all already-submitted IO to complete, but does not queue any new
				721	* writes to the disk.
				722	*
				723	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				724	* you dirty the buffers, and then use osync_inode_buffers to wait for
				725	* completion. Any other dirty buffers which are not yet queued for
				726	* write will not be flushed to disk by the osync.
				727	*/
				728	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				729	{
				730	struct buffer_head *bh;
				731	struct list_head *p;
				732	int err = 0;
				733
				734	spin_lock(lock);
				735	repeat:
				736	list_for_each_prev(p, list) {
				737	bh = BH_ENTRY(p);
				738	if (buffer_locked(bh)) {
				739	get_bh(bh);
				740	spin_unlock(lock);
				741	wait_on_buffer(bh);
				742	if (!buffer_uptodate(bh))
				743	err = -EIO;
				744	brelse(bh);
				745	spin_lock(lock);
				746	goto repeat;
				747	}
				748	}
				749	spin_unlock(lock);
				750	return err;
				751	}
				752
				753	/**
				754	* sync_mapping_buffers - write out and wait upon a mapping's "associated"
				755	* buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	756	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	757	*
				758	* Starts I/O against the buffers at mapping->private_list, and waits upon
				759	* that I/O.
				760	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	761	* Basically, this is a convenience function for fsync().
				762	* @mapping is a file or directory which needs those buffers to be written for
				763	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	764	*/
				765	int sync_mapping_buffers(struct address_space *mapping)
				766	{
				767	struct address_space *buffer_mapping = mapping->assoc_mapping;
				768
				769	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				770	return 0;
				771
				772	return fsync_buffers_list(&buffer_mapping->private_lock,
				773	&mapping->private_list);
				774	}
				775	EXPORT_SYMBOL(sync_mapping_buffers);
				776
				777	/*
				778	* Called when we've recently written block `bblock', and it is known that
				779	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				780	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				781	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				782	*/
				783	void write_boundary_block(struct block_device *bdev,
				784	sector_t bblock, unsigned blocksize)
				785	{
				786	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				787	if (bh) {
				788	if (buffer_dirty(bh))
				789	ll_rw_block(WRITE, 1, &bh);
				790	put_bh(bh);
				791	}
				792	}
				793
				794	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				795	{
				796	struct address_space *mapping = inode->i_mapping;
				797	struct address_space *buffer_mapping = bh->b_page->mapping;
				798
				799	mark_buffer_dirty(bh);
				800	if (!mapping->assoc_mapping) {
				801	mapping->assoc_mapping = buffer_mapping;
				802	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	803	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	804	}
				805	if (list_empty(&bh->b_assoc_buffers)) {
				806	spin_lock(&buffer_mapping->private_lock);
				807	list_move_tail(&bh->b_assoc_buffers,
				808	&mapping->private_list);
				809	spin_unlock(&buffer_mapping->private_lock);
				810	}
				811	}
				812	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				813
				814	/*
				815	* Add a page to the dirty page list.
				816	*
				817	* It is a sad fact of life that this function is called from several places
				818	* deeply under spinlocking. It may not sleep.
				819	*
				820	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				821	* dirty-state coherency between the page and the buffers. If the page does
				822	* not have buffers then when they are later attached they will all be set
				823	* dirty.
				824	*
				825	* The buffers are dirtied before the page is dirtied. There's a small race
				826	* window in which a writepage caller may see the page cleanness but not the
				827	* buffer dirtiness. That's fine. If this code were to set the page dirty
				828	* before the buffers, a concurrent writepage caller could clear the page dirty
				829	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				830	* page on the dirty page list.
				831	*
				832	* We use private_lock to lock against try_to_free_buffers while using the
				833	* page's buffer list. Also use this to protect against clean buffers being
				834	* added to the page after it was set dirty.
				835	*
				836	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				837	* address_space though.
				838	*/
				839	int __set_page_dirty_buffers(struct page *page)
				840	{
				841	struct address_space * const mapping = page->mapping;
				842
				843	spin_lock(&mapping->private_lock);
				844	if (page_has_buffers(page)) {
				845	struct buffer_head *head = page_buffers(page);
				846	struct buffer_head *bh = head;
				847
				848	do {
				849	set_buffer_dirty(bh);
				850	bh = bh->b_this_page;
				851	} while (bh != head);
				852	}
				853	spin_unlock(&mapping->private_lock);
				854
				855	if (!TestSetPageDirty(page)) {
				856	write_lock_irq(&mapping->tree_lock);
				857	if (page->mapping) { /* Race with truncate? */
				858	if (mapping_cap_account_dirty(mapping))
Christoph Lameter	b1e7a8f	2006-06-30 01:55:39 -0700	[diff] [blame]	859	__inc_zone_page_state(page, NR_FILE_DIRTY);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	860	radix_tree_tag_set(&mapping->page_tree,
				861	page_index(page),
				862	PAGECACHE_TAG_DIRTY);
				863	}
				864	write_unlock_irq(&mapping->tree_lock);
				865	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Andrew Morton	4741c9f	2006-03-24 03:18:11 -0800	[diff] [blame]	866	return 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	867	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	868	return 0;
				869	}
				870	EXPORT_SYMBOL(__set_page_dirty_buffers);
				871
				872	/*
				873	* Write out and wait upon a list of buffers.
				874	*
				875	* We have conflicting pressures: we want to make sure that all
				876	* initially dirty buffers get waited on, but that any subsequently
				877	* dirtied buffers don't. After all, we don't want fsync to last
				878	* forever if somebody is actively writing to the file.
				879	*
				880	* Do this in two main stages: first we copy dirty buffers to a
				881	* temporary inode list, queueing the writes as we go. Then we clean
				882	* up, waiting for those writes to complete.
				883	*
				884	* During this second stage, any subsequent updates to the file may end
				885	* up refiling the buffer on the original inode's dirty list again, so
				886	* there is a chance we will end up with a buffer queued for write but
				887	* not yet completed on that list. So, as a final cleanup we go through
				888	* the osync code to catch these locked, dirty buffers without requeuing
				889	* any newly dirty buffers for write.
				890	*/
				891	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				892	{
				893	struct buffer_head *bh;
				894	struct list_head tmp;
				895	int err = 0, err2;
				896
				897	INIT_LIST_HEAD(&tmp);
				898
				899	spin_lock(lock);
				900	while (!list_empty(list)) {
				901	bh = BH_ENTRY(list->next);
				902	list_del_init(&bh->b_assoc_buffers);
				903	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				904	list_add(&bh->b_assoc_buffers, &tmp);
				905	if (buffer_dirty(bh)) {
				906	get_bh(bh);
				907	spin_unlock(lock);
				908	/*
				909	* Ensure any pending I/O completes so that
				910	* ll_rw_block() actually writes the current
				911	* contents - it is a noop if I/O is still in
				912	* flight on potentially older contents.
				913	*/
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	914	ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	915	brelse(bh);
				916	spin_lock(lock);
				917	}
				918	}
				919	}
				920
				921	while (!list_empty(&tmp)) {
				922	bh = BH_ENTRY(tmp.prev);
				923	__remove_assoc_queue(bh);
				924	get_bh(bh);
				925	spin_unlock(lock);
				926	wait_on_buffer(bh);
				927	if (!buffer_uptodate(bh))
				928	err = -EIO;
				929	brelse(bh);
				930	spin_lock(lock);
				931	}
				932
				933	spin_unlock(lock);
				934	err2 = osync_buffers_list(lock, list);
				935	if (err)
				936	return err;
				937	else
				938	return err2;
				939	}
				940
				941	/*
				942	* Invalidate any and all dirty buffers on a given inode. We are
				943	* probably unmounting the fs, but that doesn't mean we have already
				944	* done a sync(). Just drop the buffers from the inode list.
				945	*
				946	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				947	* assumes that all the buffers are against the blockdev. Not true
				948	* for reiserfs.
				949	*/
				950	void invalidate_inode_buffers(struct inode *inode)
				951	{
				952	if (inode_has_buffers(inode)) {
				953	struct address_space *mapping = &inode->i_data;
				954	struct list_head *list = &mapping->private_list;
				955	struct address_space *buffer_mapping = mapping->assoc_mapping;
				956
				957	spin_lock(&buffer_mapping->private_lock);
				958	while (!list_empty(list))
				959	__remove_assoc_queue(BH_ENTRY(list->next));
				960	spin_unlock(&buffer_mapping->private_lock);
				961	}
				962	}
				963
				964	/*
				965	* Remove any clean buffers from the inode's buffer list. This is called
				966	* when we're trying to free the inode itself. Those buffers can pin it.
				967	*
				968	* Returns true if all buffers were removed.
				969	*/
				970	int remove_inode_buffers(struct inode *inode)
				971	{
				972	int ret = 1;
				973
				974	if (inode_has_buffers(inode)) {
				975	struct address_space *mapping = &inode->i_data;
				976	struct list_head *list = &mapping->private_list;
				977	struct address_space *buffer_mapping = mapping->assoc_mapping;
				978
				979	spin_lock(&buffer_mapping->private_lock);
				980	while (!list_empty(list)) {
				981	struct buffer_head *bh = BH_ENTRY(list->next);
				982	if (buffer_dirty(bh)) {
				983	ret = 0;
				984	break;
				985	}
				986	__remove_assoc_queue(bh);
				987	}
				988	spin_unlock(&buffer_mapping->private_lock);
				989	}
				990	return ret;
				991	}
				992
				993	/*
				994	* Create the appropriate buffers when given a page for data area and
				995	* the size of each buffer.. Use the bh->b_this_page linked list to
				996	* follow the buffers created. Return NULL if unable to create more
				997	* buffers.
				998	*
				999	* The retry flag is used to differentiate async IO (paging, swapping)
				1000	* which may not fail from ordinary buffer allocations.
				1001	*/
				1002	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				1003	int retry)
				1004	{
				1005	struct buffer_head bh, head;
				1006	long offset;
				1007
				1008	try_again:
				1009	head = NULL;
				1010	offset = PAGE_SIZE;
				1011	while ((offset -= size) >= 0) {
				1012	bh = alloc_buffer_head(GFP_NOFS);
				1013	if (!bh)
				1014	goto no_grow;
				1015
				1016	bh->b_bdev = NULL;
				1017	bh->b_this_page = head;
				1018	bh->b_blocknr = -1;
				1019	head = bh;
				1020
				1021	bh->b_state = 0;
				1022	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	1023	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1024	bh->b_size = size;
				1025
				1026	/* Link the buffer to its page */
				1027	set_bh_page(bh, page, offset);
				1028
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	1029	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1030	}
				1031	return head;
				1032	/*
				1033	* In case anything failed, we just free everything we got.
				1034	*/
				1035	no_grow:
				1036	if (head) {
				1037	do {
				1038	bh = head;
				1039	head = head->b_this_page;
				1040	free_buffer_head(bh);
				1041	} while (head);
				1042	}
				1043
				1044	/*
				1045	* Return failure for non-async IO requests. Async IO requests
				1046	* are not allowed to fail, so we have to wait until buffer heads
				1047	* become available. But we don't want tasks sleeping with
				1048	* partially complete buffers, so all were released above.
				1049	*/
				1050	if (!retry)
				1051	return NULL;
				1052
				1053	/* We're _really_ low on memory. Now we just
				1054	* wait for old buffer heads to become free due to
				1055	* finishing IO. Since this is an async request and
				1056	* the reserve list is empty, we're sure there are
				1057	* async buffer heads in use.
				1058	*/
				1059	free_more_memory();
				1060	goto try_again;
				1061	}
				1062	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				1063
				1064	static inline void
				1065	link_dev_buffers(struct page page, struct buffer_head head)
				1066	{
				1067	struct buffer_head bh, tail;
				1068
				1069	bh = head;
				1070	do {
				1071	tail = bh;
				1072	bh = bh->b_this_page;
				1073	} while (bh);
				1074	tail->b_this_page = head;
				1075	attach_page_buffers(page, head);
				1076	}
				1077
				1078	/*
				1079	* Initialise the state of a blockdev page's buffers.
				1080	*/
				1081	static void
				1082	init_page_buffers(struct page page, struct block_device bdev,
				1083	sector_t block, int size)
				1084	{
				1085	struct buffer_head *head = page_buffers(page);
				1086	struct buffer_head *bh = head;
				1087	int uptodate = PageUptodate(page);
				1088
				1089	do {
				1090	if (!buffer_mapped(bh)) {
				1091	init_buffer(bh, NULL, NULL);
				1092	bh->b_bdev = bdev;
				1093	bh->b_blocknr = block;
				1094	if (uptodate)
				1095	set_buffer_uptodate(bh);
				1096	set_buffer_mapped(bh);
				1097	}
				1098	block++;
				1099	bh = bh->b_this_page;
				1100	} while (bh != head);
				1101	}
				1102
				1103	/*
				1104	* Create the page-cache page that contains the requested block.
				1105	*
				1106	* This is used purely for blockdev mappings.
				1107	*/
				1108	static struct page *
				1109	grow_dev_page(struct block_device *bdev, sector_t block,
				1110	pgoff_t index, int size)
				1111	{
				1112	struct inode *inode = bdev->bd_inode;
				1113	struct page *page;
				1114	struct buffer_head *bh;
				1115
				1116	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
				1117	if (!page)
				1118	return NULL;
				1119
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1120	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1121
				1122	if (page_has_buffers(page)) {
				1123	bh = page_buffers(page);
				1124	if (bh->b_size == size) {
				1125	init_page_buffers(page, bdev, block, size);
				1126	return page;
				1127	}
				1128	if (!try_to_free_buffers(page))
				1129	goto failed;
				1130	}
				1131
				1132	/*
				1133	* Allocate some buffers for this page
				1134	*/
				1135	bh = alloc_page_buffers(page, size, 0);
				1136	if (!bh)
				1137	goto failed;
				1138
				1139	/*
				1140	* Link the page to the buffers and initialise them. Take the
				1141	* lock to be atomic wrt __find_get_block(), which does not
				1142	* run under the page lock.
				1143	*/
				1144	spin_lock(&inode->i_mapping->private_lock);
				1145	link_dev_buffers(page, bh);
				1146	init_page_buffers(page, bdev, block, size);
				1147	spin_unlock(&inode->i_mapping->private_lock);
				1148	return page;
				1149
				1150	failed:
				1151	BUG();
				1152	unlock_page(page);
				1153	page_cache_release(page);
				1154	return NULL;
				1155	}
				1156
				1157	/*
				1158	* Create buffers for the specified block device block's page. If
				1159	* that page was dirty, the buffers are set dirty also.
				1160	*
				1161	* Except that's a bug. Attaching dirty buffers to a dirty
				1162	* blockdev's page can result in filesystem corruption, because
				1163	* some of those buffers may be aliases of filesystem data.
				1164	* grow_dev_page() will go BUG() if this happens.
				1165	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1166	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1167	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1168	{
				1169	struct page *page;
				1170	pgoff_t index;
				1171	int sizebits;
				1172
				1173	sizebits = -1;
				1174	do {
				1175	sizebits++;
				1176	} while ((size << sizebits) < PAGE_SIZE);
				1177
				1178	index = block >> sizebits;
				1179	block = index << sizebits;
				1180
				1181	/* Create a page with the proper size buffers.. */
				1182	page = grow_dev_page(bdev, block, index, size);
				1183	if (!page)
				1184	return 0;
				1185	unlock_page(page);
				1186	page_cache_release(page);
				1187	return 1;
				1188	}
				1189
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1190	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1191	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1192	{
				1193	/* Size must be multiple of hard sectorsize */
				1194	if (unlikely(size & (bdev_hardsect_size(bdev)-1) \|\|
				1195	(size < 512 \|\| size > PAGE_SIZE))) {
				1196	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1197	size);
				1198	printk(KERN_ERR "hardsect size: %d\n",
				1199	bdev_hardsect_size(bdev));
				1200
				1201	dump_stack();
				1202	return NULL;
				1203	}
				1204
				1205	for (;;) {
				1206	struct buffer_head * bh;
				1207
				1208	bh = __find_get_block(bdev, block, size);
				1209	if (bh)
				1210	return bh;
				1211
				1212	if (!grow_buffers(bdev, block, size))
				1213	free_more_memory();
				1214	}
				1215	}
				1216
				1217	/*
				1218	* The relationship between dirty buffers and dirty pages:
				1219	*
				1220	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1221	* the page is tagged dirty in its radix tree.
				1222	*
				1223	* At all times, the dirtiness of the buffers represents the dirtiness of
				1224	* subsections of the page. If the page has buffers, the page dirty bit is
				1225	* merely a hint about the true dirty state.
				1226	*
				1227	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1228	* (if the page has buffers).
				1229	*
				1230	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1231	* buffers are not.
				1232	*
				1233	* Also. When blockdev buffers are explicitly read with bread(), they
				1234	* individually become uptodate. But their backing page remains not
				1235	* uptodate - even if all of its buffers are uptodate. A subsequent
				1236	* block_read_full_page() against that page will discover all the uptodate
				1237	* buffers, will set the page uptodate and will perform no I/O.
				1238	*/
				1239
				1240	/**
				1241	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1242	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1243	*
				1244	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1245	* backing page dirty, then tag the page as dirty in its address_space's radix
				1246	* tree and then attach the address_space's inode to its superblock's dirty
				1247	* inode list.
				1248	*
				1249	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1250	* mapping->tree_lock and the global inode_lock.
				1251	*/
				1252	void fastcall mark_buffer_dirty(struct buffer_head *bh)
				1253	{
				1254	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
				1255	__set_page_dirty_nobuffers(bh->b_page);
				1256	}
				1257
				1258	/*
				1259	* Decrement a buffer_head's reference count. If all buffers against a page
				1260	* have zero reference count, are clean and unlocked, and if the page is clean
				1261	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1262	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1263	* a page but it ends up not being freed, and buffers may later be reattached).
				1264	*/
				1265	void __brelse(struct buffer_head * buf)
				1266	{
				1267	if (atomic_read(&buf->b_count)) {
				1268	put_bh(buf);
				1269	return;
				1270	}
				1271	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
				1272	WARN_ON(1);
				1273	}
				1274
				1275	/*
				1276	* bforget() is like brelse(), except it discards any
				1277	* potentially dirty data.
				1278	*/
				1279	void __bforget(struct buffer_head *bh)
				1280	{
				1281	clear_buffer_dirty(bh);
				1282	if (!list_empty(&bh->b_assoc_buffers)) {
				1283	struct address_space *buffer_mapping = bh->b_page->mapping;
				1284
				1285	spin_lock(&buffer_mapping->private_lock);
				1286	list_del_init(&bh->b_assoc_buffers);
				1287	spin_unlock(&buffer_mapping->private_lock);
				1288	}
				1289	__brelse(bh);
				1290	}
				1291
				1292	static struct buffer_head __bread_slow(struct buffer_head bh)
				1293	{
				1294	lock_buffer(bh);
				1295	if (buffer_uptodate(bh)) {
				1296	unlock_buffer(bh);
				1297	return bh;
				1298	} else {
				1299	get_bh(bh);
				1300	bh->b_end_io = end_buffer_read_sync;
				1301	submit_bh(READ, bh);
				1302	wait_on_buffer(bh);
				1303	if (buffer_uptodate(bh))
				1304	return bh;
				1305	}
				1306	brelse(bh);
				1307	return NULL;
				1308	}
				1309
				1310	/*
				1311	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1312	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1313	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1314	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1315	* CPU's LRUs at the same time.
				1316	*
				1317	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1318	* sb_find_get_block().
				1319	*
				1320	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1321	* a local interrupt disable for that.
				1322	*/
				1323
				1324	#define BH_LRU_SIZE 8
				1325
				1326	struct bh_lru {
				1327	struct buffer_head *bhs[BH_LRU_SIZE];
				1328	};
				1329
				1330	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1331
				1332	#ifdef CONFIG_SMP
				1333	#define bh_lru_lock() local_irq_disable()
				1334	#define bh_lru_unlock() local_irq_enable()
				1335	#else
				1336	#define bh_lru_lock() preempt_disable()
				1337	#define bh_lru_unlock() preempt_enable()
				1338	#endif
				1339
				1340	static inline void check_irqs_on(void)
				1341	{
				1342	#ifdef irqs_disabled
				1343	BUG_ON(irqs_disabled());
				1344	#endif
				1345	}
				1346
				1347	/*
				1348	* The LRU management algorithm is dopey-but-simple. Sorry.
				1349	*/
				1350	static void bh_lru_install(struct buffer_head *bh)
				1351	{
				1352	struct buffer_head *evictee = NULL;
				1353	struct bh_lru *lru;
				1354
				1355	check_irqs_on();
				1356	bh_lru_lock();
				1357	lru = &__get_cpu_var(bh_lrus);
				1358	if (lru->bhs[0] != bh) {
				1359	struct buffer_head *bhs[BH_LRU_SIZE];
				1360	int in;
				1361	int out = 0;
				1362
				1363	get_bh(bh);
				1364	bhs[out++] = bh;
				1365	for (in = 0; in < BH_LRU_SIZE; in++) {
				1366	struct buffer_head *bh2 = lru->bhs[in];
				1367
				1368	if (bh2 == bh) {
				1369	__brelse(bh2);
				1370	} else {
				1371	if (out >= BH_LRU_SIZE) {
				1372	BUG_ON(evictee != NULL);
				1373	evictee = bh2;
				1374	} else {
				1375	bhs[out++] = bh2;
				1376	}
				1377	}
				1378	}
				1379	while (out < BH_LRU_SIZE)
				1380	bhs[out++] = NULL;
				1381	memcpy(lru->bhs, bhs, sizeof(bhs));
				1382	}
				1383	bh_lru_unlock();
				1384
				1385	if (evictee)
				1386	__brelse(evictee);
				1387	}
				1388
				1389	/*
				1390	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1391	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1392	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1393	lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
				1394	{
				1395	struct buffer_head *ret = NULL;
				1396	struct bh_lru *lru;
				1397	int i;
				1398
				1399	check_irqs_on();
				1400	bh_lru_lock();
				1401	lru = &__get_cpu_var(bh_lrus);
				1402	for (i = 0; i < BH_LRU_SIZE; i++) {
				1403	struct buffer_head *bh = lru->bhs[i];
				1404
				1405	if (bh && bh->b_bdev == bdev &&
				1406	bh->b_blocknr == block && bh->b_size == size) {
				1407	if (i) {
				1408	while (i) {
				1409	lru->bhs[i] = lru->bhs[i - 1];
				1410	i--;
				1411	}
				1412	lru->bhs[0] = bh;
				1413	}
				1414	get_bh(bh);
				1415	ret = bh;
				1416	break;
				1417	}
				1418	}
				1419	bh_lru_unlock();
				1420	return ret;
				1421	}
				1422
				1423	/*
				1424	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1425	* it in the LRU and mark it as accessed. If it is not present then return
				1426	* NULL
				1427	*/
				1428	struct buffer_head *
				1429	__find_get_block(struct block_device *bdev, sector_t block, int size)
				1430	{
				1431	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1432
				1433	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1434	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1435	if (bh)
				1436	bh_lru_install(bh);
				1437	}
				1438	if (bh)
				1439	touch_buffer(bh);
				1440	return bh;
				1441	}
				1442	EXPORT_SYMBOL(__find_get_block);
				1443
				1444	/*
				1445	* __getblk will locate (and, if necessary, create) the buffer_head
				1446	* which corresponds to the passed block_device, block and size. The
				1447	* returned buffer has its reference count incremented.
				1448	*
				1449	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1450	* illegal block number, __getblk() will happily return a buffer_head
				1451	* which represents the non-existent block. Very weird.
				1452	*
				1453	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1454	* attempt is failing. FIXME, perhaps?
				1455	*/
				1456	struct buffer_head *
				1457	__getblk(struct block_device *bdev, sector_t block, int size)
				1458	{
				1459	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1460
				1461	might_sleep();
				1462	if (bh == NULL)
				1463	bh = __getblk_slow(bdev, block, size);
				1464	return bh;
				1465	}
				1466	EXPORT_SYMBOL(__getblk);
				1467
				1468	/*
				1469	* Do async read-ahead on a buffer..
				1470	*/
				1471	void __breadahead(struct block_device *bdev, sector_t block, int size)
				1472	{
				1473	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1474	if (likely(bh)) {
				1475	ll_rw_block(READA, 1, &bh);
				1476	brelse(bh);
				1477	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1478	}
				1479	EXPORT_SYMBOL(__breadahead);
				1480
				1481	/**
				1482	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1483	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1484	* @block: number of block
				1485	* @size: size (in bytes) to read
				1486	*
				1487	* Reads a specified block, and returns buffer head that contains it.
				1488	* It returns NULL if the block was unreadable.
				1489	*/
				1490	struct buffer_head *
				1491	__bread(struct block_device *bdev, sector_t block, int size)
				1492	{
				1493	struct buffer_head *bh = __getblk(bdev, block, size);
				1494
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1495	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1496	bh = __bread_slow(bh);
				1497	return bh;
				1498	}
				1499	EXPORT_SYMBOL(__bread);
				1500
				1501	/*
				1502	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1503	* This doesn't race because it runs in each cpu either in irq
				1504	* or with preempt disabled.
				1505	*/
				1506	static void invalidate_bh_lru(void *arg)
				1507	{
				1508	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1509	int i;
				1510
				1511	for (i = 0; i < BH_LRU_SIZE; i++) {
				1512	brelse(b->bhs[i]);
				1513	b->bhs[i] = NULL;
				1514	}
				1515	put_cpu_var(bh_lrus);
				1516	}
				1517
				1518	static void invalidate_bh_lrus(void)
				1519	{
				1520	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
				1521	}
				1522
				1523	void set_bh_page(struct buffer_head *bh,
				1524	struct page *page, unsigned long offset)
				1525	{
				1526	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1527	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1528	if (PageHighMem(page))
				1529	/*
				1530	* This catches illegal uses and preserves the offset:
				1531	*/
				1532	bh->b_data = (char *)(0 + offset);
				1533	else
				1534	bh->b_data = page_address(page) + offset;
				1535	}
				1536	EXPORT_SYMBOL(set_bh_page);
				1537
				1538	/*
				1539	* Called when truncating a buffer on a page completely.
				1540	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1541	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1542	{
				1543	lock_buffer(bh);
				1544	clear_buffer_dirty(bh);
				1545	bh->b_bdev = NULL;
				1546	clear_buffer_mapped(bh);
				1547	clear_buffer_req(bh);
				1548	clear_buffer_new(bh);
				1549	clear_buffer_delay(bh);
				1550	unlock_buffer(bh);
				1551	}
				1552
				1553	/**
				1554	* try_to_release_page() - release old fs-specific metadata on a page
				1555	*
				1556	* @page: the page which the kernel is trying to free
				1557	* @gfp_mask: memory allocation flags (and I/O mode)
				1558	*
				1559	* The address_space is to try to release any data against the page
				1560	* (presumably at page->private). If the release was successful, return `1'.
				1561	* Otherwise return zero.
				1562	*
				1563	* The @gfp_mask argument specifies whether I/O may be performed to release
				1564	* this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
				1565	*
				1566	* NOTE: @gfp_mask may go away, and this function may become non-blocking.
				1567	*/
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	1568	int try_to_release_page(struct page *page, gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1569	{
				1570	struct address_space * const mapping = page->mapping;
				1571
				1572	BUG_ON(!PageLocked(page));
				1573	if (PageWriteback(page))
				1574	return 0;
				1575
				1576	if (mapping && mapping->a_ops->releasepage)
				1577	return mapping->a_ops->releasepage(page, gfp_mask);
				1578	return try_to_free_buffers(page);
				1579	}
				1580	EXPORT_SYMBOL(try_to_release_page);
				1581
				1582	/**
				1583	* block_invalidatepage - invalidate part or all of a buffer-backed page
				1584	*
				1585	* @page: the page which is affected
				1586	* @offset: the index of the truncation point
				1587	*
				1588	* block_invalidatepage() is called when all or part of the page has become
				1589	* invalidated by a truncate operation.
				1590	*
				1591	* block_invalidatepage() does not have to release all buffers, but it must
				1592	* ensure that no dirty buffer is left outside @offset and that no I/O
				1593	* is underway against any of the blocks which are outside the truncation
				1594	* point. Because the caller is about to free (and possibly reuse) those
				1595	* blocks on-disk.
				1596	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1597	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1598	{
				1599	struct buffer_head head, bh, *next;
				1600	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1601
				1602	BUG_ON(!PageLocked(page));
				1603	if (!page_has_buffers(page))
				1604	goto out;
				1605
				1606	head = page_buffers(page);
				1607	bh = head;
				1608	do {
				1609	unsigned int next_off = curr_off + bh->b_size;
				1610	next = bh->b_this_page;
				1611
				1612	/*
				1613	* is this block fully invalidated?
				1614	*/
				1615	if (offset <= curr_off)
				1616	discard_buffer(bh);
				1617	curr_off = next_off;
				1618	bh = next;
				1619	} while (bh != head);
				1620
				1621	/*
				1622	* We release buffers only if the entire page is being invalidated.
				1623	* The get_block cached value has been unconditionally invalidated,
				1624	* so real IO is not possible anymore.
				1625	*/
				1626	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1627	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1628	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1629	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1630	}
				1631	EXPORT_SYMBOL(block_invalidatepage);
				1632
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1633	void do_invalidatepage(struct page *page, unsigned long offset)
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	1634	{
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1635	void (invalidatepage)(struct page , unsigned long);
				1636	invalidatepage = page->mapping->a_ops->invalidatepage ? :
				1637	block_invalidatepage;
				1638	(*invalidatepage)(page, offset);
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	1639	}
				1640
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1641	/*
				1642	* We attach and possibly dirty the buffers atomically wrt
				1643	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1644	* is already excluded via the page lock.
				1645	*/
				1646	void create_empty_buffers(struct page *page,
				1647	unsigned long blocksize, unsigned long b_state)
				1648	{
				1649	struct buffer_head bh, head, *tail;
				1650
				1651	head = alloc_page_buffers(page, blocksize, 1);
				1652	bh = head;
				1653	do {
				1654	bh->b_state \|= b_state;
				1655	tail = bh;
				1656	bh = bh->b_this_page;
				1657	} while (bh);
				1658	tail->b_this_page = head;
				1659
				1660	spin_lock(&page->mapping->private_lock);
				1661	if (PageUptodate(page) \|\| PageDirty(page)) {
				1662	bh = head;
				1663	do {
				1664	if (PageDirty(page))
				1665	set_buffer_dirty(bh);
				1666	if (PageUptodate(page))
				1667	set_buffer_uptodate(bh);
				1668	bh = bh->b_this_page;
				1669	} while (bh != head);
				1670	}
				1671	attach_page_buffers(page, head);
				1672	spin_unlock(&page->mapping->private_lock);
				1673	}
				1674	EXPORT_SYMBOL(create_empty_buffers);
				1675
				1676	/*
				1677	* We are taking a block for data and we don't want any output from any
				1678	* buffer-cache aliases starting from return from that function and
				1679	* until the moment when something will explicitly mark the buffer
				1680	* dirty (hopefully that will not happen until we will free that block ;-)
				1681	* We don't even need to mark it not-uptodate - nobody can expect
				1682	* anything from a newly allocated buffer anyway. We used to use
				1683	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1684	* don't want to mark the alias unmapped, for example - it would confuse
				1685	* anyone who might pick it with bread() afterwards...
				1686	*
				1687	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1688	* be writeout I/O going on against recently-freed buffers. We don't
				1689	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1690	* only if we really need to. That happens here.
				1691	*/
				1692	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1693	{
				1694	struct buffer_head *old_bh;
				1695
				1696	might_sleep();
				1697
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1698	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1699	if (old_bh) {
				1700	clear_buffer_dirty(old_bh);
				1701	wait_on_buffer(old_bh);
				1702	clear_buffer_req(old_bh);
				1703	__brelse(old_bh);
				1704	}
				1705	}
				1706	EXPORT_SYMBOL(unmap_underlying_metadata);
				1707
				1708	/*
				1709	* NOTE! All mapped/uptodate combinations are valid:
				1710	*
				1711	* Mapped Uptodate Meaning
				1712	*
				1713	* No No "unknown" - must do get_block()
				1714	* No Yes "hole" - zero-filled
				1715	* Yes No "allocated" - allocated on disk, not read in
				1716	* Yes Yes "valid" - allocated and up-to-date in memory.
				1717	*
				1718	* "Dirty" is valid only with the last case (mapped+uptodate).
				1719	*/
				1720
				1721	/*
				1722	* While block_write_full_page is writing back the dirty buffers under
				1723	* the page lock, whoever dirtied the buffers may decide to clean them
				1724	* again at any time. We handle that by only looking at the buffer
				1725	* state inside lock_buffer().
				1726	*
				1727	* If block_write_full_page() is called for regular writeback
				1728	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1729	* locked buffer. This only can happen if someone has written the buffer
				1730	* directly, with submit_bh(). At the address_space level PageWriteback
				1731	* prevents this contention from occurring.
				1732	*/
				1733	static int __block_write_full_page(struct inode inode, struct page page,
				1734	get_block_t get_block, struct writeback_control wbc)
				1735	{
				1736	int err;
				1737	sector_t block;
				1738	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1739	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1740	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1741	int nr_underway = 0;
				1742
				1743	BUG_ON(!PageLocked(page));
				1744
				1745	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1746
				1747	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1748	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1749	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1750	}
				1751
				1752	/*
				1753	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1754	* here, and the (potentially unmapped) buffers may become dirty at
				1755	* any time. If a buffer becomes dirty here after we've inspected it
				1756	* then we just miss that fact, and the page stays dirty.
				1757	*
				1758	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1759	* handle that here by just cleaning them.
				1760	*/
				1761
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1762	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1763	head = page_buffers(page);
				1764	bh = head;
				1765
				1766	/*
				1767	* Get all the dirty buffers mapped to disk addresses and
				1768	* handle any aliases from the underlying blockdev's mapping.
				1769	*/
				1770	do {
				1771	if (block > last_block) {
				1772	/*
				1773	* mapped buffers outside i_size will occur, because
				1774	* this page can be outside i_size when there is a
				1775	* truncate in progress.
				1776	*/
				1777	/*
				1778	* The buffer was zeroed by block_write_full_page()
				1779	*/
				1780	clear_buffer_dirty(bh);
				1781	set_buffer_uptodate(bh);
				1782	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1783	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1784	err = get_block(inode, block, bh, 1);
				1785	if (err)
				1786	goto recover;
				1787	if (buffer_new(bh)) {
				1788	/* blockdev mappings never come here */
				1789	clear_buffer_new(bh);
				1790	unmap_underlying_metadata(bh->b_bdev,
				1791	bh->b_blocknr);
				1792	}
				1793	}
				1794	bh = bh->b_this_page;
				1795	block++;
				1796	} while (bh != head);
				1797
				1798	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1799	if (!buffer_mapped(bh))
				1800	continue;
				1801	/*
				1802	* If it's a fully non-blocking write attempt and we cannot
				1803	* lock the buffer then redirty the page. Note that this can
				1804	* potentially cause a busy-wait loop from pdflush and kswapd
				1805	* activity, but those code paths have their own higher-level
				1806	* throttling.
				1807	*/
				1808	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1809	lock_buffer(bh);
				1810	} else if (test_set_buffer_locked(bh)) {
				1811	redirty_page_for_writepage(wbc, page);
				1812	continue;
				1813	}
				1814	if (test_clear_buffer_dirty(bh)) {
				1815	mark_buffer_async_write(bh);
				1816	} else {
				1817	unlock_buffer(bh);
				1818	}
				1819	} while ((bh = bh->b_this_page) != head);
				1820
				1821	/*
				1822	* The page and its buffers are protected by PageWriteback(), so we can
				1823	* drop the bh refcounts early.
				1824	*/
				1825	BUG_ON(PageWriteback(page));
				1826	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1827
				1828	do {
				1829	struct buffer_head *next = bh->b_this_page;
				1830	if (buffer_async_write(bh)) {
				1831	submit_bh(WRITE, bh);
				1832	nr_underway++;
				1833	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1834	bh = next;
				1835	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1836	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1837
				1838	err = 0;
				1839	done:
				1840	if (nr_underway == 0) {
				1841	/*
				1842	* The page was marked dirty, but the buffers were
				1843	* clean. Someone wrote them back by hand with
				1844	* ll_rw_block/submit_bh. A rare case.
				1845	*/
				1846	int uptodate = 1;
				1847	do {
				1848	if (!buffer_uptodate(bh)) {
				1849	uptodate = 0;
				1850	break;
				1851	}
				1852	bh = bh->b_this_page;
				1853	} while (bh != head);
				1854	if (uptodate)
				1855	SetPageUptodate(page);
				1856	end_page_writeback(page);
				1857	/*
				1858	* The page and buffer_heads can be released at any time from
				1859	* here on.
				1860	*/
				1861	wbc->pages_skipped++; /* We didn't write this page */
				1862	}
				1863	return err;
				1864
				1865	recover:
				1866	/*
				1867	* ENOSPC, or some other error. We may already have added some
				1868	* blocks to the file, so we need to write these out to avoid
				1869	* exposing stale data.
				1870	* The page is currently locked and not marked for writeback
				1871	*/
				1872	bh = head;
				1873	/* Recovery: lock and submit the mapped buffers */
				1874	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1875	if (buffer_mapped(bh) && buffer_dirty(bh)) {
				1876	lock_buffer(bh);
				1877	mark_buffer_async_write(bh);
				1878	} else {
				1879	/*
				1880	* The buffer may have been set dirty during
				1881	* attachment to a dirty page.
				1882	*/
				1883	clear_buffer_dirty(bh);
				1884	}
				1885	} while ((bh = bh->b_this_page) != head);
				1886	SetPageError(page);
				1887	BUG_ON(PageWriteback(page));
				1888	set_page_writeback(page);
				1889	unlock_page(page);
				1890	do {
				1891	struct buffer_head *next = bh->b_this_page;
				1892	if (buffer_async_write(bh)) {
				1893	clear_buffer_dirty(bh);
				1894	submit_bh(WRITE, bh);
				1895	nr_underway++;
				1896	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1897	bh = next;
				1898	} while (bh != head);
				1899	goto done;
				1900	}
				1901
				1902	static int __block_prepare_write(struct inode inode, struct page page,
				1903	unsigned from, unsigned to, get_block_t *get_block)
				1904	{
				1905	unsigned block_start, block_end;
				1906	sector_t block;
				1907	int err = 0;
				1908	unsigned blocksize, bbits;
				1909	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1910
				1911	BUG_ON(!PageLocked(page));
				1912	BUG_ON(from > PAGE_CACHE_SIZE);
				1913	BUG_ON(to > PAGE_CACHE_SIZE);
				1914	BUG_ON(from > to);
				1915
				1916	blocksize = 1 << inode->i_blkbits;
				1917	if (!page_has_buffers(page))
				1918	create_empty_buffers(page, blocksize, 0);
				1919	head = page_buffers(page);
				1920
				1921	bbits = inode->i_blkbits;
				1922	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1923
				1924	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1925	block++, block_start=block_end, bh = bh->b_this_page) {
				1926	block_end = block_start + blocksize;
				1927	if (block_end <= from \|\| block_start >= to) {
				1928	if (PageUptodate(page)) {
				1929	if (!buffer_uptodate(bh))
				1930	set_buffer_uptodate(bh);
				1931	}
				1932	continue;
				1933	}
				1934	if (buffer_new(bh))
				1935	clear_buffer_new(bh);
				1936	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1937	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1938	err = get_block(inode, block, bh, 1);
				1939	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1940	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1941	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1942	unmap_underlying_metadata(bh->b_bdev,
				1943	bh->b_blocknr);
				1944	if (PageUptodate(page)) {
				1945	set_buffer_uptodate(bh);
				1946	continue;
				1947	}
				1948	if (block_end > to \|\| block_start < from) {
				1949	void *kaddr;
				1950
				1951	kaddr = kmap_atomic(page, KM_USER0);
				1952	if (block_end > to)
				1953	memset(kaddr+to, 0,
				1954	block_end-to);
				1955	if (block_start < from)
				1956	memset(kaddr+block_start,
				1957	0, from-block_start);
				1958	flush_dcache_page(page);
				1959	kunmap_atomic(kaddr, KM_USER0);
				1960	}
				1961	continue;
				1962	}
				1963	}
				1964	if (PageUptodate(page)) {
				1965	if (!buffer_uptodate(bh))
				1966	set_buffer_uptodate(bh);
				1967	continue;
				1968	}
				1969	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
				1970	(block_start < from \|\| block_end > to)) {
				1971	ll_rw_block(READ, 1, &bh);
				1972	*wait_bh++=bh;
				1973	}
				1974	}
				1975	/*
				1976	* If we issued read requests - let them complete.
				1977	*/
				1978	while(wait_bh > wait) {
				1979	wait_on_buffer(*--wait_bh);
				1980	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1981	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1982	}
Anton Altaparmakov	152becd	2005-06-23 00:10:21 -0700	[diff] [blame]	1983	if (!err) {
				1984	bh = head;
				1985	do {
				1986	if (buffer_new(bh))
				1987	clear_buffer_new(bh);
				1988	} while ((bh = bh->b_this_page) != head);
				1989	return 0;
				1990	}
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1991	/* Error case: */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1992	/*
				1993	* Zero out any newly allocated blocks to avoid exposing stale
				1994	* data. If BH_New is set, we know that the block was newly
				1995	* allocated in the above loop.
				1996	*/
				1997	bh = head;
				1998	block_start = 0;
				1999	do {
				2000	block_end = block_start+blocksize;
				2001	if (block_end <= from)
				2002	goto next_bh;
				2003	if (block_start >= to)
				2004	break;
				2005	if (buffer_new(bh)) {
				2006	void *kaddr;
				2007
				2008	clear_buffer_new(bh);
				2009	kaddr = kmap_atomic(page, KM_USER0);
				2010	memset(kaddr+block_start, 0, bh->b_size);
				2011	kunmap_atomic(kaddr, KM_USER0);
				2012	set_buffer_uptodate(bh);
				2013	mark_buffer_dirty(bh);
				2014	}
				2015	next_bh:
				2016	block_start = block_end;
				2017	bh = bh->b_this_page;
				2018	} while (bh != head);
				2019	return err;
				2020	}
				2021
				2022	static int __block_commit_write(struct inode inode, struct page page,
				2023	unsigned from, unsigned to)
				2024	{
				2025	unsigned block_start, block_end;
				2026	int partial = 0;
				2027	unsigned blocksize;
				2028	struct buffer_head bh, head;
				2029
				2030	blocksize = 1 << inode->i_blkbits;
				2031
				2032	for(bh = head = page_buffers(page), block_start = 0;
				2033	bh != head \|\| !block_start;
				2034	block_start=block_end, bh = bh->b_this_page) {
				2035	block_end = block_start + blocksize;
				2036	if (block_end <= from \|\| block_start >= to) {
				2037	if (!buffer_uptodate(bh))
				2038	partial = 1;
				2039	} else {
				2040	set_buffer_uptodate(bh);
				2041	mark_buffer_dirty(bh);
				2042	}
				2043	}
				2044
				2045	/*
				2046	* If this is a partial write which happened to make all buffers
				2047	* uptodate then we can optimize away a bogus readpage() for
				2048	* the next read(). Here we 'discover' whether the page went
				2049	* uptodate as a result of this (potentially partial) write.
				2050	*/
				2051	if (!partial)
				2052	SetPageUptodate(page);
				2053	return 0;
				2054	}
				2055
				2056	/*
				2057	* Generic "read page" function for block devices that have the normal
				2058	* get_block functionality. This is most of the block device filesystems.
				2059	* Reads the page asynchronously --- the unlock_buffer() and
				2060	* set/clear_buffer_uptodate() functions propagate buffer state into the
				2061	* page struct once IO has completed.
				2062	*/
				2063	int block_read_full_page(struct page page, get_block_t get_block)
				2064	{
				2065	struct inode *inode = page->mapping->host;
				2066	sector_t iblock, lblock;
				2067	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				2068	unsigned int blocksize;
				2069	int nr, i;
				2070	int fully_mapped = 1;
				2071
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	2072	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2073	blocksize = 1 << inode->i_blkbits;
				2074	if (!page_has_buffers(page))
				2075	create_empty_buffers(page, blocksize, 0);
				2076	head = page_buffers(page);
				2077
				2078	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2079	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				2080	bh = head;
				2081	nr = 0;
				2082	i = 0;
				2083
				2084	do {
				2085	if (buffer_uptodate(bh))
				2086	continue;
				2087
				2088	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2089	int err = 0;
				2090
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2091	fully_mapped = 0;
				2092	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2093	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2094	err = get_block(inode, iblock, bh, 0);
				2095	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2096	SetPageError(page);
				2097	}
				2098	if (!buffer_mapped(bh)) {
				2099	void *kaddr = kmap_atomic(page, KM_USER0);
				2100	memset(kaddr + i * blocksize, 0, blocksize);
				2101	flush_dcache_page(page);
				2102	kunmap_atomic(kaddr, KM_USER0);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2103	if (!err)
				2104	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2105	continue;
				2106	}
				2107	/*
				2108	* get_block() might have updated the buffer
				2109	* synchronously
				2110	*/
				2111	if (buffer_uptodate(bh))
				2112	continue;
				2113	}
				2114	arr[nr++] = bh;
				2115	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2116
				2117	if (fully_mapped)
				2118	SetPageMappedToDisk(page);
				2119
				2120	if (!nr) {
				2121	/*
				2122	* All buffers are uptodate - we can set the page uptodate
				2123	* as well. But not if get_block() returned an error.
				2124	*/
				2125	if (!PageError(page))
				2126	SetPageUptodate(page);
				2127	unlock_page(page);
				2128	return 0;
				2129	}
				2130
				2131	/* Stage two: lock the buffers */
				2132	for (i = 0; i < nr; i++) {
				2133	bh = arr[i];
				2134	lock_buffer(bh);
				2135	mark_buffer_async_read(bh);
				2136	}
				2137
				2138	/*
				2139	* Stage 3: start the IO. Check for uptodateness
				2140	* inside the buffer lock in case another process reading
				2141	* the underlying blockdev brought it uptodate (the sct fix).
				2142	*/
				2143	for (i = 0; i < nr; i++) {
				2144	bh = arr[i];
				2145	if (buffer_uptodate(bh))
				2146	end_buffer_async_read(bh, 1);
				2147	else
				2148	submit_bh(READ, bh);
				2149	}
				2150	return 0;
				2151	}
				2152
				2153	/* utility function for filesystems that need to do work on expanding
				2154	* truncates. Uses prepare/commit_write to allow the filesystem to
				2155	* deal with the hole.
				2156	*/
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2157	static int __generic_cont_expand(struct inode *inode, loff_t size,
				2158	pgoff_t index, unsigned int offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2159	{
				2160	struct address_space *mapping = inode->i_mapping;
				2161	struct page *page;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2162	unsigned long limit;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2163	int err;
				2164
				2165	err = -EFBIG;
				2166	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2167	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2168	send_sig(SIGXFSZ, current, 0);
				2169	goto out;
				2170	}
				2171	if (size > inode->i_sb->s_maxbytes)
				2172	goto out;
				2173
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2174	err = -ENOMEM;
				2175	page = grab_cache_page(mapping, index);
				2176	if (!page)
				2177	goto out;
				2178	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2179	if (err) {
				2180	/*
				2181	* ->prepare_write() may have instantiated a few blocks
				2182	* outside i_size. Trim these off again.
				2183	*/
				2184	unlock_page(page);
				2185	page_cache_release(page);
				2186	vmtruncate(inode, inode->i_size);
				2187	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2188	}
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2189
				2190	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
				2191
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2192	unlock_page(page);
				2193	page_cache_release(page);
				2194	if (err > 0)
				2195	err = 0;
				2196	out:
				2197	return err;
				2198	}
				2199
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2200	int generic_cont_expand(struct inode *inode, loff_t size)
				2201	{
				2202	pgoff_t index;
				2203	unsigned int offset;
				2204
				2205	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
				2206
				2207	/* ugh. in prepare/commit_write, if from==to==start of block, we
				2208	** skip the prepare. make sure we never send an offset for the start
				2209	** of a block
				2210	*/
				2211	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				2212	/* caller must handle this extra byte. */
				2213	offset++;
				2214	}
				2215	index = size >> PAGE_CACHE_SHIFT;
				2216
				2217	return __generic_cont_expand(inode, size, index, offset);
				2218	}
				2219
				2220	int generic_cont_expand_simple(struct inode *inode, loff_t size)
				2221	{
				2222	loff_t pos = size - 1;
				2223	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
				2224	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
				2225
				2226	/* prepare/commit_write can handle even if from==to==start of block. */
				2227	return __generic_cont_expand(inode, size, index, offset);
				2228	}
				2229
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2230	/*
				2231	* For moronic filesystems that do not allow holes in file.
				2232	* We may have to extend the file.
				2233	*/
				2234
				2235	int cont_prepare_write(struct page *page, unsigned offset,
				2236	unsigned to, get_block_t get_block, loff_t bytes)
				2237	{
				2238	struct address_space *mapping = page->mapping;
				2239	struct inode *inode = mapping->host;
				2240	struct page *new_page;
				2241	pgoff_t pgpos;
				2242	long status;
				2243	unsigned zerofrom;
				2244	unsigned blocksize = 1 << inode->i_blkbits;
				2245	void *kaddr;
				2246
				2247	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
				2248	status = -ENOMEM;
				2249	new_page = grab_cache_page(mapping, pgpos);
				2250	if (!new_page)
				2251	goto out;
				2252	/* we might sleep */
				2253	if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
				2254	unlock_page(new_page);
				2255	page_cache_release(new_page);
				2256	continue;
				2257	}
				2258	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2259	if (zerofrom & (blocksize-1)) {
				2260	*bytes \|= (blocksize-1);
				2261	(*bytes)++;
				2262	}
				2263	status = __block_prepare_write(inode, new_page, zerofrom,
				2264	PAGE_CACHE_SIZE, get_block);
				2265	if (status)
				2266	goto out_unmap;
				2267	kaddr = kmap_atomic(new_page, KM_USER0);
				2268	memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
				2269	flush_dcache_page(new_page);
				2270	kunmap_atomic(kaddr, KM_USER0);
				2271	generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
				2272	unlock_page(new_page);
				2273	page_cache_release(new_page);
				2274	}
				2275
				2276	if (page->index < pgpos) {
				2277	/* completely inside the area */
				2278	zerofrom = offset;
				2279	} else {
				2280	/* page covers the boundary, find the boundary offset */
				2281	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2282
				2283	/* if we will expand the thing last block will be filled */
				2284	if (to > zerofrom && (zerofrom & (blocksize-1))) {
				2285	*bytes \|= (blocksize-1);
				2286	(*bytes)++;
				2287	}
				2288
				2289	/* starting below the boundary? Nothing to zero out */
				2290	if (offset <= zerofrom)
				2291	zerofrom = offset;
				2292	}
				2293	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
				2294	if (status)
				2295	goto out1;
				2296	if (zerofrom < offset) {
				2297	kaddr = kmap_atomic(page, KM_USER0);
				2298	memset(kaddr+zerofrom, 0, offset-zerofrom);
				2299	flush_dcache_page(page);
				2300	kunmap_atomic(kaddr, KM_USER0);
				2301	__block_commit_write(inode, page, zerofrom, offset);
				2302	}
				2303	return 0;
				2304	out1:
				2305	ClearPageUptodate(page);
				2306	return status;
				2307
				2308	out_unmap:
				2309	ClearPageUptodate(new_page);
				2310	unlock_page(new_page);
				2311	page_cache_release(new_page);
				2312	out:
				2313	return status;
				2314	}
				2315
				2316	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2317	get_block_t *get_block)
				2318	{
				2319	struct inode *inode = page->mapping->host;
				2320	int err = __block_prepare_write(inode, page, from, to, get_block);
				2321	if (err)
				2322	ClearPageUptodate(page);
				2323	return err;
				2324	}
				2325
				2326	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2327	{
				2328	struct inode *inode = page->mapping->host;
				2329	__block_commit_write(inode,page,from,to);
				2330	return 0;
				2331	}
				2332
				2333	int generic_commit_write(struct file file, struct page page,
				2334	unsigned from, unsigned to)
				2335	{
				2336	struct inode *inode = page->mapping->host;
				2337	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2338	__block_commit_write(inode,page,from,to);
				2339	/*
				2340	* No need to use i_size_read() here, the i_size
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	2341	* cannot change under us because we hold i_mutex.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2342	*/
				2343	if (pos > inode->i_size) {
				2344	i_size_write(inode, pos);
				2345	mark_inode_dirty(inode);
				2346	}
				2347	return 0;
				2348	}
				2349
				2350
				2351	/*
				2352	* nobh_prepare_write()'s prereads are special: the buffer_heads are freed
				2353	* immediately, while under the page lock. So it needs a special end_io
				2354	* handler which does not touch the bh after unlocking it.
				2355	*
				2356	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				2357	* a race there is benign: unlock_buffer() only use the bh's address for
				2358	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				2359	* itself.
				2360	*/
				static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				{
					if (!uptodate)
						/* This happens, due to failed READA attempts. */
						clear_buffer_uptodate(bh);
					else
						set_buffer_uptodate(bh);

					/* Record the I/O result first, then release the buffer. */
					unlock_buffer(bh);
				}
				2371
				2372	/*
				2373	* On entry, the page is fully not uptodate.
				2374	* On exit the page is fully uptodate in the areas outside (from,to)
				2375	*/
				2376	int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
				2377	get_block_t *get_block)
				2378	{
				2379	struct inode *inode = page->mapping->host;
				2380	const unsigned blkbits = inode->i_blkbits;
				2381	const unsigned blocksize = 1 << blkbits;
				2382	struct buffer_head map_bh;
				2383	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
				2384	unsigned block_in_page;
				2385	unsigned block_start;
				2386	sector_t block_in_file;
				2387	char *kaddr;
				2388	int nr_reads = 0;
				2389	int i;
				2390	int ret = 0;
				2391	int is_mapped_to_disk = 1;
				2392	int dirtied_it = 0;
				2393
				2394	if (PageMappedToDisk(page))
				2395	return 0;
				2396
				2397	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
				2398	map_bh.b_page = page;
				2399
				2400	/*
				2401	* We loop across all blocks in the page, whether or not they are
				2402	* part of the affected region. This is so we can discover if the
				2403	* page is fully mapped-to-disk.
				2404	*/
				2405	for (block_start = 0, block_in_page = 0;
				2406	block_start < PAGE_CACHE_SIZE;
				2407	block_in_page++, block_start += blocksize) {
				2408	unsigned block_end = block_start + blocksize;
				2409	int create;
				2410
				2411	map_bh.b_state = 0;
				2412	create = 1;
				2413	if (block_start >= to)
				2414	create = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2415	map_bh.b_size = blocksize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2416	ret = get_block(inode, block_in_file + block_in_page,
				2417	&map_bh, create);
				2418	if (ret)
				2419	goto failed;
				2420	if (!buffer_mapped(&map_bh))
				2421	is_mapped_to_disk = 0;
				2422	if (buffer_new(&map_bh))
				2423	unmap_underlying_metadata(map_bh.b_bdev,
				2424	map_bh.b_blocknr);
				2425	if (PageUptodate(page))
				2426	continue;
				2427	if (buffer_new(&map_bh) \|\| !buffer_mapped(&map_bh)) {
				2428	kaddr = kmap_atomic(page, KM_USER0);
				2429	if (block_start < from) {
				2430	memset(kaddr+block_start, 0, from-block_start);
				2431	dirtied_it = 1;
				2432	}
				2433	if (block_end > to) {
				2434	memset(kaddr + to, 0, block_end - to);
				2435	dirtied_it = 1;
				2436	}
				2437	flush_dcache_page(page);
				2438	kunmap_atomic(kaddr, KM_USER0);
				2439	continue;
				2440	}
				2441	if (buffer_uptodate(&map_bh))
				2442	continue; /* reiserfs does this */
				2443	if (block_start < from \|\| block_end > to) {
				2444	struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
				2445
				2446	if (!bh) {
				2447	ret = -ENOMEM;
				2448	goto failed;
				2449	}
				2450	bh->b_state = map_bh.b_state;
				2451	atomic_set(&bh->b_count, 0);
				2452	bh->b_this_page = NULL;
				2453	bh->b_page = page;
				2454	bh->b_blocknr = map_bh.b_blocknr;
				2455	bh->b_size = blocksize;
				2456	bh->b_data = (char *)(long)block_start;
				2457	bh->b_bdev = map_bh.b_bdev;
				2458	bh->b_private = NULL;
				2459	read_bh[nr_reads++] = bh;
				2460	}
				2461	}
				2462
				2463	if (nr_reads) {
				2464	struct buffer_head *bh;
				2465
				2466	/*
				2467	* The page is locked, so these buffers are protected from
				2468	* any VM or truncate activity. Hence we don't need to care
				2469	* for the buffer_head refcounts.
				2470	*/
				2471	for (i = 0; i < nr_reads; i++) {
				2472	bh = read_bh[i];
				2473	lock_buffer(bh);
				2474	bh->b_end_io = end_buffer_read_nobh;
				2475	submit_bh(READ, bh);
				2476	}
				2477	for (i = 0; i < nr_reads; i++) {
				2478	bh = read_bh[i];
				2479	wait_on_buffer(bh);
				2480	if (!buffer_uptodate(bh))
				2481	ret = -EIO;
				2482	free_buffer_head(bh);
				2483	read_bh[i] = NULL;
				2484	}
				2485	if (ret)
				2486	goto failed;
				2487	}
				2488
				2489	if (is_mapped_to_disk)
				2490	SetPageMappedToDisk(page);
				2491	SetPageUptodate(page);
				2492
				2493	/*
				2494	* Setting the page dirty here isn't necessary for the prepare_write
				2495	* function - commit_write will do that. But if/when this function is
				2496	* used within the pagefault handler to ensure that all mmapped pages
				2497	* have backing space in the filesystem, we will need to dirty the page
				2498	* if its contents were altered.
				2499	*/
				2500	if (dirtied_it)
				2501	set_page_dirty(page);
				2502
				2503	return 0;
				2504
				2505	failed:
				2506	for (i = 0; i < nr_reads; i++) {
				2507	if (read_bh[i])
				2508	free_buffer_head(read_bh[i]);
				2509	}
				2510
				2511	/*
				2512	* Error recovery is pretty slack. Clear the page and mark it dirty
				2513	* so we'll later zero out any blocks which _were_ allocated.
				2514	*/
				2515	kaddr = kmap_atomic(page, KM_USER0);
				2516	memset(kaddr, 0, PAGE_CACHE_SIZE);
				2517	kunmap_atomic(kaddr, KM_USER0);
				2518	SetPageUptodate(page);
				2519	set_page_dirty(page);
				2520	return ret;
				2521	}
				2522	EXPORT_SYMBOL(nobh_prepare_write);
				2523
				2524	int nobh_commit_write(struct file file, struct page page,
				2525	unsigned from, unsigned to)
				2526	{
				2527	struct inode *inode = page->mapping->host;
				2528	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2529
				2530	set_page_dirty(page);
				2531	if (pos > inode->i_size) {
				2532	i_size_write(inode, pos);
				2533	mark_inode_dirty(inode);
				2534	}
				2535	return 0;
				2536	}
				2537	EXPORT_SYMBOL(nobh_commit_write);
				2538
				2539	/*
				2540	* nobh_writepage() - based on block_full_write_page() except
				2541	* that it tries to operate without attaching bufferheads to
				2542	* the page.
				2543	*/
				2544	int nobh_writepage(struct page page, get_block_t get_block,
				2545	struct writeback_control *wbc)
				2546	{
				2547	struct inode * const inode = page->mapping->host;
				2548	loff_t i_size = i_size_read(inode);
				2549	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2550	unsigned offset;
				2551	void *kaddr;
				2552	int ret;
				2553
				2554	/* Is the page fully inside i_size? */
				2555	if (page->index < end_index)
				2556	goto out;
				2557
				2558	/* Is the page fully outside i_size? (truncate in progress) */
				2559	offset = i_size & (PAGE_CACHE_SIZE-1);
				2560	if (page->index >= end_index+1 \|\| !offset) {
				2561	/*
				2562	* The page may have dirty, unmapped buffers. For example,
				2563	* they may have been added in ext3_writepage(). Make them
				2564	* freeable here, so the page does not leak.
				2565	*/
				2566	#if 0
				2567	/* Not really sure about this - do we need this ? */
				2568	if (page->mapping->a_ops->invalidatepage)
				2569	page->mapping->a_ops->invalidatepage(page, offset);
				2570	#endif
				2571	unlock_page(page);
				2572	return 0; /* don't care */
				2573	}
				2574
				2575	/*
				2576	* The page straddles i_size. It must be zeroed out on each and every
				2577	* writepage invocation because it may be mmapped. "A file is mapped
				2578	* in multiples of the page size. For a file that is not a multiple of
				2579	* the page size, the remaining memory is zeroed when mapped, and
				2580	* writes to that region are not written out to the file."
				2581	*/
				2582	kaddr = kmap_atomic(page, KM_USER0);
				2583	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2584	flush_dcache_page(page);
				2585	kunmap_atomic(kaddr, KM_USER0);
				2586	out:
				2587	ret = mpage_writepage(page, get_block, wbc);
				2588	if (ret == -EAGAIN)
				2589	ret = __block_write_full_page(inode, page, get_block, wbc);
				2590	return ret;
				2591	}
				2592	EXPORT_SYMBOL(nobh_writepage);
				2593
				2594	/*
				2595	* This function assumes that ->prepare_write() uses nobh_prepare_write().
				2596	*/
				2597	int nobh_truncate_page(struct address_space *mapping, loff_t from)
				2598	{
				2599	struct inode *inode = mapping->host;
				2600	unsigned blocksize = 1 << inode->i_blkbits;
				2601	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2602	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2603	unsigned to;
				2604	struct page *page;
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	2605	const struct address_space_operations *a_ops = mapping->a_ops;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2606	char *kaddr;
				2607	int ret = 0;
				2608
				2609	if ((offset & (blocksize - 1)) == 0)
				2610	goto out;
				2611
				2612	ret = -ENOMEM;
				2613	page = grab_cache_page(mapping, index);
				2614	if (!page)
				2615	goto out;
				2616
				2617	to = (offset + blocksize) & ~(blocksize - 1);
				2618	ret = a_ops->prepare_write(NULL, page, offset, to);
				2619	if (ret == 0) {
				2620	kaddr = kmap_atomic(page, KM_USER0);
				2621	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2622	flush_dcache_page(page);
				2623	kunmap_atomic(kaddr, KM_USER0);
				2624	set_page_dirty(page);
				2625	}
				2626	unlock_page(page);
				2627	page_cache_release(page);
				2628	out:
				2629	return ret;
				2630	}
				2631	EXPORT_SYMBOL(nobh_truncate_page);
				2632
				2633	int block_truncate_page(struct address_space *mapping,
				2634	loff_t from, get_block_t *get_block)
				2635	{
				2636	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2637	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2638	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2639	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2640	unsigned length, pos;
				2641	struct inode *inode = mapping->host;
				2642	struct page *page;
				2643	struct buffer_head *bh;
				2644	void *kaddr;
				2645	int err;
				2646
				2647	blocksize = 1 << inode->i_blkbits;
				2648	length = offset & (blocksize - 1);
				2649
				2650	/* Block boundary? Nothing to do */
				2651	if (!length)
				2652	return 0;
				2653
				2654	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2655	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2656
				2657	page = grab_cache_page(mapping, index);
				2658	err = -ENOMEM;
				2659	if (!page)
				2660	goto out;
				2661
				2662	if (!page_has_buffers(page))
				2663	create_empty_buffers(page, blocksize, 0);
				2664
				2665	/* Find the buffer that contains "offset" */
				2666	bh = page_buffers(page);
				2667	pos = blocksize;
				2668	while (offset >= pos) {
				2669	bh = bh->b_this_page;
				2670	iblock++;
				2671	pos += blocksize;
				2672	}
				2673
				2674	err = 0;
				2675	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2676	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2677	err = get_block(inode, iblock, bh, 0);
				2678	if (err)
				2679	goto unlock;
				2680	/* unmapped? It's a hole - nothing to do */
				2681	if (!buffer_mapped(bh))
				2682	goto unlock;
				2683	}
				2684
				2685	/* Ok, it's mapped. Make sure it's up-to-date */
				2686	if (PageUptodate(page))
				2687	set_buffer_uptodate(bh);
				2688
				2689	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
				2690	err = -EIO;
				2691	ll_rw_block(READ, 1, &bh);
				2692	wait_on_buffer(bh);
				2693	/* Uhhuh. Read error. Complain and punt. */
				2694	if (!buffer_uptodate(bh))
				2695	goto unlock;
				2696	}
				2697
				2698	kaddr = kmap_atomic(page, KM_USER0);
				2699	memset(kaddr + offset, 0, length);
				2700	flush_dcache_page(page);
				2701	kunmap_atomic(kaddr, KM_USER0);
				2702
				2703	mark_buffer_dirty(bh);
				2704	err = 0;
				2705
				2706	unlock:
				2707	unlock_page(page);
				2708	page_cache_release(page);
				2709	out:
				2710	return err;
				2711	}
				2712
				2713	/*
				2714	* The generic ->writepage function for buffer-backed address_spaces
				2715	*/
				2716	int block_write_full_page(struct page page, get_block_t get_block,
				2717	struct writeback_control *wbc)
				2718	{
				2719	struct inode * const inode = page->mapping->host;
				2720	loff_t i_size = i_size_read(inode);
				2721	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2722	unsigned offset;
				2723	void *kaddr;
				2724
				2725	/* Is the page fully inside i_size? */
				2726	if (page->index < end_index)
				2727	return __block_write_full_page(inode, page, get_block, wbc);
				2728
				2729	/* Is the page fully outside i_size? (truncate in progress) */
				2730	offset = i_size & (PAGE_CACHE_SIZE-1);
				2731	if (page->index >= end_index+1 \|\| !offset) {
				2732	/*
				2733	* The page may have dirty, unmapped buffers. For example,
				2734	* they may have been added in ext3_writepage(). Make them
				2735	* freeable here, so the page does not leak.
				2736	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2737	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2738	unlock_page(page);
				2739	return 0; /* don't care */
				2740	}
				2741
				2742	/*
				2743	* The page straddles i_size. It must be zeroed out on each and every
				2744	* writepage invokation because it may be mmapped. "A file is mapped
				2745	* in multiples of the page size. For a file that is not a multiple of
				2746	* the page size, the remaining memory is zeroed when mapped, and
				2747	* writes to that region are not written out to the file."
				2748	*/
				2749	kaddr = kmap_atomic(page, KM_USER0);
				2750	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2751	flush_dcache_page(page);
				2752	kunmap_atomic(kaddr, KM_USER0);
				2753	return __block_write_full_page(inode, page, get_block, wbc);
				2754	}
				2755
				2756	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2757	get_block_t *get_block)
				2758	{
				2759	struct buffer_head tmp;
				2760	struct inode *inode = mapping->host;
				2761	tmp.b_state = 0;
				2762	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2763	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2764	get_block(inode, block, &tmp, 0);
				2765	return tmp.b_blocknr;
				2766	}
				2767
				2768	static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
				2769	{
				2770	struct buffer_head *bh = bio->bi_private;
				2771
				2772	if (bio->bi_size)
				2773	return 1;
				2774
				2775	if (err == -EOPNOTSUPP) {
				2776	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2777	set_bit(BH_Eopnotsupp, &bh->b_state);
				2778	}
				2779
				2780	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2781	bio_put(bio);
				2782	return 0;
				2783	}
				2784
				2785	int submit_bh(int rw, struct buffer_head * bh)
				2786	{
				2787	struct bio *bio;
				2788	int ret = 0;
				2789
				2790	BUG_ON(!buffer_locked(bh));
				2791	BUG_ON(!buffer_mapped(bh));
				2792	BUG_ON(!bh->b_end_io);
				2793
				2794	if (buffer_ordered(bh) && (rw == WRITE))
				2795	rw = WRITE_BARRIER;
				2796
				2797	/*
				2798	* Only clear out a write error when rewriting, should this
				2799	* include WRITE_SYNC as well?
				2800	*/
				2801	if (test_set_buffer_req(bh) && (rw == WRITE \|\| rw == WRITE_BARRIER))
				2802	clear_buffer_write_io_error(bh);
				2803
				2804	/*
				2805	* from here on down, it's all bio -- do the initial mapping,
				2806	* submit_bio -> generic_make_request may further map this bio around
				2807	*/
				2808	bio = bio_alloc(GFP_NOIO, 1);
				2809
				2810	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2811	bio->bi_bdev = bh->b_bdev;
				2812	bio->bi_io_vec[0].bv_page = bh->b_page;
				2813	bio->bi_io_vec[0].bv_len = bh->b_size;
				2814	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2815
				2816	bio->bi_vcnt = 1;
				2817	bio->bi_idx = 0;
				2818	bio->bi_size = bh->b_size;
				2819
				2820	bio->bi_end_io = end_bio_bh_io_sync;
				2821	bio->bi_private = bh;
				2822
				2823	bio_get(bio);
				2824	submit_bio(rw, bio);
				2825
				2826	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2827	ret = -EOPNOTSUPP;
				2828
				2829	bio_put(bio);
				2830	return ret;
				2831	}
				2832
				2833	/**
				2834	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2835	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2836	* @nr: number of &struct buffer_heads in the array
				2837	* @bhs: array of pointers to &struct buffer_head
				2838	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2839	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2840	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2841	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2842	* are sent to disk. The fourth %READA option is described in the documentation
				2843	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2844	*
				2845	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2846	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				2847	* clean when doing a write request, and any buffer that appears to be
				2848	* up-to-date when doing read request. Further it marks as clean buffers that
				2849	* are processed for writing (the buffer cache won't assume that they are
				2850	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2851	*
				2852	* ll_rw_block sets b_end_io to simple completion handler that marks
				2853	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				2854	* any waiters.
				2855	*
				2856	* All of the buffers must be for the same device, and must also be a
				2857	* multiple of the current approved size for the device.
				2858	*/
				2859	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				2860	{
				2861	int i;
				2862
				2863	for (i = 0; i < nr; i++) {
				2864	struct buffer_head *bh = bhs[i];
				2865
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2866	if (rw == SWRITE)
				2867	lock_buffer(bh);
				2868	else if (test_set_buffer_locked(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2869	continue;
				2870
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2871	if (rw == WRITE \|\| rw == SWRITE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2872	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2873	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2874	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2875	submit_bh(WRITE, bh);
				2876	continue;
				2877	}
				2878	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2879	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2880	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2881	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2882	submit_bh(rw, bh);
				2883	continue;
				2884	}
				2885	}
				2886	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2887	}
				2888	}
				2889
				2890	/*
				2891	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2892	* and then start new I/O and then wait upon it. The caller must have a ref on
				2893	* the buffer_head.
				2894	*/
				2895	int sync_dirty_buffer(struct buffer_head *bh)
				2896	{
				2897	int ret = 0;
				2898
				2899	WARN_ON(atomic_read(&bh->b_count) < 1);
				2900	lock_buffer(bh);
				2901	if (test_clear_buffer_dirty(bh)) {
				2902	get_bh(bh);
				2903	bh->b_end_io = end_buffer_write_sync;
				2904	ret = submit_bh(WRITE, bh);
				2905	wait_on_buffer(bh);
				2906	if (buffer_eopnotsupp(bh)) {
				2907	clear_buffer_eopnotsupp(bh);
				2908	ret = -EOPNOTSUPP;
				2909	}
				2910	if (!ret && !buffer_uptodate(bh))
				2911	ret = -EIO;
				2912	} else {
				2913	unlock_buffer(bh);
				2914	}
				2915	return ret;
				2916	}
				2917
				2918	/*
				2919	* try_to_free_buffers() checks if all the buffers on this particular page
				2920	* are unused, and releases them if so.
				2921	*
				2922	* Exclusion against try_to_free_buffers may be obtained by either
				2923	* locking the page or by holding its mapping's private_lock.
				2924	*
				2925	* If the page is dirty but all the buffers are clean then we need to
				2926	* be sure to mark the page clean as well. This is because the page
				2927	* may be against a block device, and a later reattachment of buffers
				2928	* to a dirty page will set all buffers dirty. Which would corrupt
				2929	* filesystem data on the same device.
				2930	*
				2931	* The same applies to regular filesystem pages: if all the buffers are
				2932	* clean then we set the page clean and proceed. To do that, we require
				2933	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				2934	* private_lock.
				2935	*
				2936	* try_to_free_buffers() is non-blocking.
				2937	*/
				2938	static inline int buffer_busy(struct buffer_head *bh)
				2939	{
				2940	return atomic_read(&bh->b_count) \|
				2941	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				2942	}
				2943
				2944	static int
				2945	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				2946	{
				2947	struct buffer_head *head = page_buffers(page);
				2948	struct buffer_head *bh;
				2949
				2950	bh = head;
				2951	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	2952	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2953	set_bit(AS_EIO, &page->mapping->flags);
				2954	if (buffer_busy(bh))
				2955	goto failed;
				2956	bh = bh->b_this_page;
				2957	} while (bh != head);
				2958
				2959	do {
				2960	struct buffer_head *next = bh->b_this_page;
				2961
				2962	if (!list_empty(&bh->b_assoc_buffers))
				2963	__remove_assoc_queue(bh);
				2964	bh = next;
				2965	} while (bh != head);
				2966	*buffers_to_free = head;
				2967	__clear_page_buffers(page);
				2968	return 1;
				2969	failed:
				2970	return 0;
				2971	}
				2972
				2973	int try_to_free_buffers(struct page *page)
				2974	{
				2975	struct address_space * const mapping = page->mapping;
				2976	struct buffer_head *buffers_to_free = NULL;
				2977	int ret = 0;
				2978
				2979	BUG_ON(!PageLocked(page));
				2980	if (PageWriteback(page))
				2981	return 0;
				2982
				2983	if (mapping == NULL) { /* can this still happen? */
				2984	ret = drop_buffers(page, &buffers_to_free);
				2985	goto out;
				2986	}
				2987
				2988	spin_lock(&mapping->private_lock);
				2989	ret = drop_buffers(page, &buffers_to_free);
				2990	if (ret) {
				2991	/*
				2992	* If the filesystem writes its buffers by hand (eg ext3)
				2993	* then we can have clean buffers against a dirty page. We
				2994	* clean the page here; otherwise later reattachment of buffers
				2995	* could encounter a non-uptodate page, which is unresolvable.
				2996	* This only applies in the rare case where try_to_free_buffers
				2997	* succeeds but the page is not freed.
				2998	*/
				2999	clear_page_dirty(page);
				3000	}
				3001	spin_unlock(&mapping->private_lock);
				3002	out:
				3003	if (buffers_to_free) {
				3004	struct buffer_head *bh = buffers_to_free;
				3005
				3006	do {
				3007	struct buffer_head *next = bh->b_this_page;
				3008	free_buffer_head(bh);
				3009	bh = next;
				3010	} while (bh != buffers_to_free);
				3011	}
				3012	return ret;
				3013	}
				3014	EXPORT_SYMBOL(try_to_free_buffers);
				3015
NeilBrown	3978d717	2006-03-26 01:37:17 -0800	[diff] [blame]	3016	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3017	{
				3018	struct address_space *mapping;
				3019
				3020	smp_mb();
				3021	mapping = page_mapping(page);
				3022	if (mapping)
				3023	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3024	}
				3025
				3026	/*
				3027	* There are no bdflush tunables left. But distributions are
				3028	* still running obsolete flush daemons, so we terminate them here.
				3029	*
				3030	* Use of bdflush() is deprecated and will be removed in a future kernel.
				3031	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				3032	*/
				3033	asmlinkage long sys_bdflush(int func, long data)
				3034	{
				3035	static int msg_count;
				3036
				3037	if (!capable(CAP_SYS_ADMIN))
				3038	return -EPERM;
				3039
				3040	if (msg_count < 5) {
				3041	msg_count++;
				3042	printk(KERN_INFO
				3043	"warning: process `%s' used the obsolete bdflush"
				3044	" system call\n", current->comm);
				3045	printk(KERN_INFO "Fix your initscripts?\n");
				3046	}
				3047
				3048	if (func == 1)
				3049	do_exit(0);
				3050	return 0;
				3051	}
				3052
				3053	/*
				3054	* Buffer-head allocation
				3055	*/
				3056	static kmem_cache_t *bh_cachep;
				3057
				3058	/*
				3059	* Once the number of bh's in the machine exceeds this level, we start
				3060	* stripping them in writeback.
				3061	*/
				3062	static int max_buffer_heads;
				3063
				3064	int buffer_heads_over_limit;
				3065
				3066	struct bh_accounting {
				3067	int nr; /* Number of live bh's */
				3068	int ratelimit; /* Limit cacheline bouncing */
				3069	};
				3070
				3071	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3072
				3073	static void recalc_bh_state(void)
				3074	{
				3075	int i;
				3076	int tot = 0;
				3077
				3078	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				3079	return;
				3080	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3081	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3082	tot += per_cpu(bh_accounting, i).nr;
				3083	buffer_heads_over_limit = (tot > max_buffer_heads);
				3084	}
				3085
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	3086	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3087	{
				3088	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
				3089	if (ret) {
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3090	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3091	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3092	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3093	}
				3094	return ret;
				3095	}
				3096	EXPORT_SYMBOL(alloc_buffer_head);
				3097
				3098	void free_buffer_head(struct buffer_head *bh)
				3099	{
				3100	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3101	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3102	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3103	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3104	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3105	}
				3106	EXPORT_SYMBOL(free_buffer_head);
				3107
				3108	static void
				3109	init_buffer_head(void data, kmem_cache_t cachep, unsigned long flags)
				3110	{
				3111	if ((flags & (SLAB_CTOR_VERIFY\|SLAB_CTOR_CONSTRUCTOR)) ==
				3112	SLAB_CTOR_CONSTRUCTOR) {
				3113	struct buffer_head * bh = (struct buffer_head *)data;
				3114
				3115	memset(bh, 0, sizeof(*bh));
				3116	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				3117	}
				3118	}
				3119
				3120	#ifdef CONFIG_HOTPLUG_CPU
				3121	static void buffer_exit_cpu(int cpu)
				3122	{
				3123	int i;
				3124	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3125
				3126	for (i = 0; i < BH_LRU_SIZE; i++) {
				3127	brelse(b->bhs[i]);
				3128	b->bhs[i] = NULL;
				3129	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3130	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				3131	per_cpu(bh_accounting, cpu).nr = 0;
				3132	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3133	}
				3134
				3135	static int buffer_cpu_notify(struct notifier_block *self,
				3136	unsigned long action, void *hcpu)
				3137	{
				3138	if (action == CPU_DEAD)
				3139	buffer_exit_cpu((unsigned long)hcpu);
				3140	return NOTIFY_OK;
				3141	}
				3142	#endif /* CONFIG_HOTPLUG_CPU */
				3143
				3144	void __init buffer_init(void)
				3145	{
				3146	int nrpages;
				3147
				3148	bh_cachep = kmem_cache_create("buffer_head",
Paul Jackson	b019600	2006-03-24 03:16:09 -0800	[diff] [blame]	3149	sizeof(struct buffer_head), 0,
				3150	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3151	SLAB_MEM_SPREAD),
				3152	init_buffer_head,
				3153	NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3154
				3155	/*
				3156	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3157	*/
				3158	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3159	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3160	hotcpu_notifier(buffer_cpu_notify, 0);
				3161	}
				3162
				3163	EXPORT_SYMBOL(__bforget);
				3164	EXPORT_SYMBOL(__brelse);
				3165	EXPORT_SYMBOL(__wait_on_buffer);
				3166	EXPORT_SYMBOL(block_commit_write);
				3167	EXPORT_SYMBOL(block_prepare_write);
				3168	EXPORT_SYMBOL(block_read_full_page);
				3169	EXPORT_SYMBOL(block_sync_page);
				3170	EXPORT_SYMBOL(block_truncate_page);
				3171	EXPORT_SYMBOL(block_write_full_page);
				3172	EXPORT_SYMBOL(cont_prepare_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3173	EXPORT_SYMBOL(end_buffer_read_sync);
				3174	EXPORT_SYMBOL(end_buffer_write_sync);
				3175	EXPORT_SYMBOL(file_fsync);
				3176	EXPORT_SYMBOL(fsync_bdev);
				3177	EXPORT_SYMBOL(generic_block_bmap);
				3178	EXPORT_SYMBOL(generic_commit_write);
				3179	EXPORT_SYMBOL(generic_cont_expand);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	3180	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3181	EXPORT_SYMBOL(init_buffer);
				3182	EXPORT_SYMBOL(invalidate_bdev);
				3183	EXPORT_SYMBOL(ll_rw_block);
				3184	EXPORT_SYMBOL(mark_buffer_dirty);
				3185	EXPORT_SYMBOL(submit_bh);
				3186	EXPORT_SYMBOL(sync_dirty_buffer);
				3187	EXPORT_SYMBOL(unlock_buffer);