Blame - fs/buffer.c - kernel/msm-4.9

blob: 1c62203a4906ec1c7ba2ca9ede00e2a2c498aeda [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required by older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
				21	#include <linux/config.h>
				22	#include <linux/kernel.h>
				23	#include <linux/syscalls.h>
				24	#include <linux/fs.h>
				25	#include <linux/mm.h>
				26	#include <linux/percpu.h>
				27	#include <linux/slab.h>
				28	#include <linux/smp_lock.h>
				29	#include <linux/blkdev.h>
				30	#include <linux/file.h>
				31	#include <linux/quotaops.h>
				32	#include <linux/highmem.h>
				33	#include <linux/module.h>
				34	#include <linux/writeback.h>
				35	#include <linux/hash.h>
				36	#include <linux/suspend.h>
				37	#include <linux/buffer_head.h>
				38	#include <linux/bio.h>
				39	#include <linux/notifier.h>
				40	#include <linux/cpu.h>
				41	#include <linux/bitops.h>
				42	#include <linux/mpage.h>
				43
				44	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
				45	static void invalidate_bh_lrus(void);
				46
				47	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				48
				49	inline void
				50	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				51	{
				52	bh->b_end_io = handler;
				53	bh->b_private = private;
				54	}
				55
				56	static int sync_buffer(void *word)
				57	{
				58	struct block_device *bd;
				59	struct buffer_head *bh
				60	= container_of(word, struct buffer_head, b_state);
				61
				62	smp_mb();
				63	bd = bh->b_bdev;
				64	if (bd)
				65	blk_run_address_space(bd->bd_inode->i_mapping);
				66	io_schedule();
				67	return 0;
				68	}
				69
				70	void fastcall __lock_buffer(struct buffer_head *bh)
				71	{
				72	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				73	TASK_UNINTERRUPTIBLE);
				74	}
				75	EXPORT_SYMBOL(__lock_buffer);
				76
				77	void fastcall unlock_buffer(struct buffer_head *bh)
				78	{
				79	clear_buffer_locked(bh);
				80	smp_mb__after_clear_bit();
				81	wake_up_bit(&bh->b_state, BH_Lock);
				82	}
				83
				84	/*
				85	* Block until a buffer comes unlocked. This doesn't stop it
				86	* from becoming locked again - you have to lock it yourself
				87	* if you want to preserve its state.
				88	*/
				89	void __wait_on_buffer(struct buffer_head * bh)
				90	{
				91	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				92	}
				93
				94	static void
				95	__clear_page_buffers(struct page *page)
				96	{
				97	ClearPagePrivate(page);
				98	page->private = 0;
				99	page_cache_release(page);
				100	}
				101
				102	static void buffer_io_error(struct buffer_head *bh)
				103	{
				104	char b[BDEVNAME_SIZE];
				105
				106	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				107	bdevname(bh->b_bdev, b),
				108	(unsigned long long)bh->b_blocknr);
				109	}
				110
				111	/*
				112	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				113	* unlock the buffer. This is what ll_rw_block uses too.
				114	*/
				void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				{
					if (!uptodate) {
						/* This happens, due to failed READA attempts. */
						clear_buffer_uptodate(bh);
					} else {
						set_buffer_uptodate(bh);
					}

					/* Unlock the buffer, then drop one reference on it. */
					unlock_buffer(bh);
					put_bh(bh);
				}
				126
				127	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				128	{
				129	char b[BDEVNAME_SIZE];
				130
				131	if (uptodate) {
				132	set_buffer_uptodate(bh);
				133	} else {
				134	if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
				135	buffer_io_error(bh);
				136	printk(KERN_WARNING "lost page write due to "
				137	"I/O error on %s\n",
				138	bdevname(bh->b_bdev, b));
				139	}
				140	set_buffer_write_io_error(bh);
				141	clear_buffer_uptodate(bh);
				142	}
				143	unlock_buffer(bh);
				144	put_bh(bh);
				145	}
				146
				147	/*
				148	* Write out and wait upon all the dirty data associated with a block
				149	* device via its mapping. Does not take the superblock lock.
				150	*/
				151	int sync_blockdev(struct block_device *bdev)
				152	{
				153	int ret = 0;
				154
				155	if (bdev) {
				156	int err;
				157
				158	ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
				159	err = filemap_fdatawait(bdev->bd_inode->i_mapping);
				160	if (!ret)
				161	ret = err;
				162	}
				163	return ret;
				164	}
				165	EXPORT_SYMBOL(sync_blockdev);
				166
				167	/*
				168	* Write out and wait upon all dirty data associated with this
				169	* superblock. Filesystem data as well as the underlying block
				170	* device. Takes the superblock lock.
				171	*/
				172	int fsync_super(struct super_block *sb)
				173	{
				174	sync_inodes_sb(sb, 0);
				175	DQUOT_SYNC(sb);
				176	lock_super(sb);
				177	if (sb->s_dirt && sb->s_op->write_super)
				178	sb->s_op->write_super(sb);
				179	unlock_super(sb);
				180	if (sb->s_op->sync_fs)
				181	sb->s_op->sync_fs(sb, 1);
				182	sync_blockdev(sb->s_bdev);
				183	sync_inodes_sb(sb, 1);
				184
				185	return sync_blockdev(sb->s_bdev);
				186	}
				187
				188	/*
				189	* Write out and wait upon all dirty data associated with this
				190	* device. Filesystem data as well as the underlying block
				191	* device. Takes the superblock lock.
				192	*/
				int fsync_bdev(struct block_device *bdev)
				{
					struct super_block *sb;
					int res;

					sb = get_super(bdev);
					if (!sb) {
						/* No superblock found for the device: sync the device only. */
						return sync_blockdev(bdev);
					}

					res = fsync_super(sb);
					drop_super(sb);		/* pairs with get_super() above */
					return res;
				}
				203
				204	/**
				205	* freeze_bdev -- lock a filesystem and force it into a consistent state
				206	* @bdev: blockdevice to lock
				207	*
				208	* This takes the block device bd_mount_sem to make sure no new mounts
				209	* happen on bdev until thaw_bdev() is called.
				210	* If a superblock is found on this device, we take the s_umount semaphore
				211	* on it to make sure nobody unmounts until the snapshot creation is done.
				212	*/
				213	struct super_block freeze_bdev(struct block_device bdev)
				214	{
				215	struct super_block *sb;
				216
				217	down(&bdev->bd_mount_sem);
				218	sb = get_super(bdev);
				219	if (sb && !(sb->s_flags & MS_RDONLY)) {
				220	sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	221	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	222
				223	sync_inodes_sb(sb, 0);
				224	DQUOT_SYNC(sb);
				225
				226	lock_super(sb);
				227	if (sb->s_dirt && sb->s_op->write_super)
				228	sb->s_op->write_super(sb);
				229	unlock_super(sb);
				230
				231	if (sb->s_op->sync_fs)
				232	sb->s_op->sync_fs(sb, 1);
				233
				234	sync_blockdev(sb->s_bdev);
				235	sync_inodes_sb(sb, 1);
				236
				237	sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	238	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	239
				240	sync_blockdev(sb->s_bdev);
				241
				242	if (sb->s_op->write_super_lockfs)
				243	sb->s_op->write_super_lockfs(sb);
				244	}
				245
				246	sync_blockdev(bdev);
				247	return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
				248	}
				249	EXPORT_SYMBOL(freeze_bdev);
				250
				251	/**
				252	* thaw_bdev -- unlock filesystem
				253	* @bdev: blockdevice to unlock
				254	* @sb: associated superblock
				255	*
				256	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
				257	*/
				258	void thaw_bdev(struct block_device bdev, struct super_block sb)
				259	{
				260	if (sb) {
				261	BUG_ON(sb->s_bdev != bdev);
				262
				263	if (sb->s_op->unlockfs)
				264	sb->s_op->unlockfs(sb);
				265	sb->s_frozen = SB_UNFROZEN;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	266	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	267	wake_up(&sb->s_wait_unfrozen);
				268	drop_super(sb);
				269	}
				270
				271	up(&bdev->bd_mount_sem);
				272	}
				273	EXPORT_SYMBOL(thaw_bdev);
				274
				275	/*
				276	* sync everything. Start out by waking pdflush, because that writes back
				277	* all queues in parallel.
				278	*/
				279	static void do_sync(unsigned long wait)
				280	{
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	281	wakeup_pdflush(0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	282	sync_inodes(0); /* All mappings, inodes and their blockdevs */
				283	DQUOT_SYNC(NULL);
				284	sync_supers(); /* Write the superblocks */
				285	sync_filesystems(0); /* Start syncing the filesystems */
				286	sync_filesystems(wait); /* Waitingly sync the filesystems */
				287	sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
				288	if (!wait)
				289	printk("Emergency Sync complete\n");
				290	if (unlikely(laptop_mode))
				291	laptop_sync_completion();
				292	}
				293
				294	asmlinkage long sys_sync(void)
				295	{
				296	do_sync(1);
				297	return 0;
				298	}
				299
				300	void emergency_sync(void)
				301	{
				302	pdflush_operation(do_sync, 0);
				303	}
				304
				305	/*
				306	* Generic function to fsync a file.
				307	*
				308	* filp may be NULL if called via the msync of a vma.
				309	*/
				310
				311	int file_fsync(struct file filp, struct dentry dentry, int datasync)
				312	{
				313	struct inode * inode = dentry->d_inode;
				314	struct super_block * sb;
				315	int ret, err;
				316
				317	/* sync the inode to buffers */
				318	ret = write_inode_now(inode, 0);
				319
				320	/* sync the superblock to buffers */
				321	sb = inode->i_sb;
				322	lock_super(sb);
				323	if (sb->s_op->write_super)
				324	sb->s_op->write_super(sb);
				325	unlock_super(sb);
				326
				327	/* .. finally sync the buffers to disk */
				328	err = sync_blockdev(sb->s_bdev);
				329	if (!ret)
				330	ret = err;
				331	return ret;
				332	}
				333
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	334	static long do_fsync(unsigned int fd, int datasync)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	335	{
				336	struct file * file;
				337	struct address_space *mapping;
				338	int ret, err;
				339
				340	ret = -EBADF;
				341	file = fget(fd);
				342	if (!file)
				343	goto out;
				344
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	345	ret = -EINVAL;
				346	if (!file->f_op \|\| !file->f_op->fsync) {
				347	/* Why? We can still call filemap_fdatawrite */
				348	goto out_putf;
				349	}
				350
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	351	mapping = file->f_mapping;
				352
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	353	current->flags \|= PF_SYNCWRITE;
				354	ret = filemap_fdatawrite(mapping);
				355
				356	/*
				357	* We need to protect against concurrent writers,
				358	* which could cause livelocks in fsync_buffers_list
				359	*/
				360	down(&mapping->host->i_sem);
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	361	err = file->f_op->fsync(file, file->f_dentry, datasync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	362	if (!ret)
				363	ret = err;
				364	up(&mapping->host->i_sem);
				365	err = filemap_fdatawait(mapping);
				366	if (!ret)
				367	ret = err;
				368	current->flags &= ~PF_SYNCWRITE;
				369
				370	out_putf:
				371	fput(file);
				372	out:
				373	return ret;
				374	}
				375
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	376	asmlinkage long sys_fsync(unsigned int fd)
				377	{
				378	return do_fsync(fd, 0);
				379	}
				380
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	381	asmlinkage long sys_fdatasync(unsigned int fd)
				382	{
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	383	return do_fsync(fd, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	384	}
				385
				386	/*
				387	* Various filesystems appear to want __find_get_block to be non-blocking.
				388	* But it's the page lock which protects the buffers. To get around this,
				389	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				390	* private_lock.
				391	*
				392	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				393	* may be quite high. This code could TryLock the page, and if that
				394	* succeeds, there is no need to take private_lock. (But if
				395	* private_lock is contended then so is mapping->tree_lock).
				396	*/
				397	static struct buffer_head *
				398	__find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
				399	{
				400	struct inode *bd_inode = bdev->bd_inode;
				401	struct address_space *bd_mapping = bd_inode->i_mapping;
				402	struct buffer_head *ret = NULL;
				403	pgoff_t index;
				404	struct buffer_head *bh;
				405	struct buffer_head *head;
				406	struct page *page;
				407	int all_mapped = 1;
				408
				409	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				410	page = find_get_page(bd_mapping, index);
				411	if (!page)
				412	goto out;
				413
				414	spin_lock(&bd_mapping->private_lock);
				415	if (!page_has_buffers(page))
				416	goto out_unlock;
				417	head = page_buffers(page);
				418	bh = head;
				419	do {
				420	if (bh->b_blocknr == block) {
				421	ret = bh;
				422	get_bh(bh);
				423	goto out_unlock;
				424	}
				425	if (!buffer_mapped(bh))
				426	all_mapped = 0;
				427	bh = bh->b_this_page;
				428	} while (bh != head);
				429
				430	/* we might be here because some of the buffers on this page are
				431	* not mapped. This is due to various races between
				432	* file io on the block device and getblk. It gets dealt with
				433	* elsewhere, don't buffer_error if we had some unmapped buffers
				434	*/
				435	if (all_mapped) {
				436	printk("__find_get_block_slow() failed. "
				437	"block=%llu, b_blocknr=%llu\n",
				438	(unsigned long long)block, (unsigned long long)bh->b_blocknr);
				439	printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
				440	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				441	}
				442	out_unlock:
				443	spin_unlock(&bd_mapping->private_lock);
				444	page_cache_release(page);
				445	out:
				446	return ret;
				447	}
				448
				449	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				450	of fs corruption is going on. Trashing dirty data always implies losing
				451	information that was supposed to be just stored on the physical layer
				452	by the user.
				453
				454	Thus invalidate_buffers in general usage is not allowed to trash
				455	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				456	be preserved. These buffers are simply skipped.
				457
				458	We also skip buffers which are still in use. For example this can
				459	happen if a userspace program is reading the block device.
				460
				461	NOTE: In the case where the user removed a removable-media-disk even if
				462	there's still dirty data not synced on disk (due a bug in the device driver
				463	or due an error of the user), by not destroying the dirty buffers we could
				464	generate corruption also on the next media inserted, thus a parameter is
				465	necessary to handle this case in the most safe way possible (trying
				466	to not corrupt also the new disk inserted with the data belonging to
				467	the old now corrupted disk). Also for the ramdisk the natural thing
				468	to do in order to release the ramdisk memory is to destroy dirty buffers.
				469
				470	These are two special cases. Normal usage imply the device driver
				471	to issue a sync on the device (without waiting I/O completion) and
				472	then an invalidate_buffers call that doesn't trash dirty buffers.
				473
				474	For handling cache coherency with the blkdev pagecache the 'update' case
				475	has been introduced. It is needed to re-read from disk any pinned
				476	buffer. NOTE: re-reading from disk is destructive so we can do it only
				477	when we assume nobody is changing the buffercache under our I/O and when
				478	we think the disk contains more recent information than the buffercache.
				479	The update == 1 pass marks the buffers we need to update, the update == 2
				480	pass does the actual I/O. */
				481	void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
				482	{
				483	invalidate_bh_lrus();
				484	/*
				485	* FIXME: what about destroy_dirty_buffers?
				486	* We really want to use invalidate_inode_pages2() for
				487	* that, but not until that's cleaned up.
				488	*/
				489	invalidate_inode_pages(bdev->bd_inode->i_mapping);
				490	}
				491
				492	/*
				493	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				494	*/
				495	static void free_more_memory(void)
				496	{
				497	struct zone **zones;
				498	pg_data_t *pgdat;
				499
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	500	wakeup_pdflush(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	501	yield();
				502
				503	for_each_pgdat(pgdat) {
				504	zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
				505	if (*zones)
Darren Hart	1ad539b	2005-06-21 17:14:53 -0700	[diff] [blame]	506	try_to_free_pages(zones, GFP_NOFS);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	507	}
				508	}
				509
				510	/*
				511	* I/O completion handler for block_read_full_page() - pages
				512	* which come unlocked at the end of I/O.
				513	*/
				514	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				515	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	516	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	517	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	518	struct buffer_head *tmp;
				519	struct page *page;
				520	int page_uptodate = 1;
				521
				522	BUG_ON(!buffer_async_read(bh));
				523
				524	page = bh->b_page;
				525	if (uptodate) {
				526	set_buffer_uptodate(bh);
				527	} else {
				528	clear_buffer_uptodate(bh);
				529	if (printk_ratelimit())
				530	buffer_io_error(bh);
				531	SetPageError(page);
				532	}
				533
				534	/*
				535	* Be _very_ careful from here on. Bad things can happen if
				536	* two buffer heads end IO at almost the same time and both
				537	* decide that the page is now completely done.
				538	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	539	first = page_buffers(page);
				540	local_irq_save(flags);
				541	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	542	clear_buffer_async_read(bh);
				543	unlock_buffer(bh);
				544	tmp = bh;
				545	do {
				546	if (!buffer_uptodate(tmp))
				547	page_uptodate = 0;
				548	if (buffer_async_read(tmp)) {
				549	BUG_ON(!buffer_locked(tmp));
				550	goto still_busy;
				551	}
				552	tmp = tmp->b_this_page;
				553	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	554	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				555	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	556
				557	/*
				558	* If none of the buffers had errors and they are all
				559	* uptodate then we can set the page uptodate.
				560	*/
				561	if (page_uptodate && !PageError(page))
				562	SetPageUptodate(page);
				563	unlock_page(page);
				564	return;
				565
				566	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	567	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				568	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	569	return;
				570	}
				571
				572	/*
				573	* Completion handler for block_write_full_page() - pages which are unlocked
				574	* during I/O, and which have PageWriteback cleared upon I/O completion.
				575	*/
				576	void end_buffer_async_write(struct buffer_head *bh, int uptodate)
				577	{
				578	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	579	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	580	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	581	struct buffer_head *tmp;
				582	struct page *page;
				583
				584	BUG_ON(!buffer_async_write(bh));
				585
				586	page = bh->b_page;
				587	if (uptodate) {
				588	set_buffer_uptodate(bh);
				589	} else {
				590	if (printk_ratelimit()) {
				591	buffer_io_error(bh);
				592	printk(KERN_WARNING "lost page write due to "
				593	"I/O error on %s\n",
				594	bdevname(bh->b_bdev, b));
				595	}
				596	set_bit(AS_EIO, &page->mapping->flags);
				597	clear_buffer_uptodate(bh);
				598	SetPageError(page);
				599	}
				600
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	601	first = page_buffers(page);
				602	local_irq_save(flags);
				603	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				604
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	605	clear_buffer_async_write(bh);
				606	unlock_buffer(bh);
				607	tmp = bh->b_this_page;
				608	while (tmp != bh) {
				609	if (buffer_async_write(tmp)) {
				610	BUG_ON(!buffer_locked(tmp));
				611	goto still_busy;
				612	}
				613	tmp = tmp->b_this_page;
				614	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	615	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				616	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	617	end_page_writeback(page);
				618	return;
				619
				620	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	621	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				622	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	623	return;
				624	}
				625
				626	/*
				627	* If a page's buffers are under async readin (end_buffer_async_read
				628	* completion) then there is a possibility that another thread of
				629	* control could lock one of the buffers after it has completed
				630	* but while some of the other buffers have not completed. This
				631	* locked buffer would confuse end_buffer_async_read() into not unlocking
				632	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				633	* that this buffer is not under async I/O.
				634	*
				635	* The page comes unlocked when it has no locked buffer_async buffers
				636	* left.
				637	*
				638	* PageLocked prevents anyone starting new async I/O reads against any of
				639	* the buffers.
				640	*
				641	* PageWriteback is used to prevent simultaneous writeout of the same
				642	* page.
				643	*
				644	* PageLocked prevents anyone from starting writeback of a page which is
				645	* under read I/O (PageWriteback is only ever set against a locked page).
				646	*/
				647	static void mark_buffer_async_read(struct buffer_head *bh)
				648	{
				649	bh->b_end_io = end_buffer_async_read;
				650	set_buffer_async_read(bh);
				651	}
				652
				653	void mark_buffer_async_write(struct buffer_head *bh)
				654	{
				655	bh->b_end_io = end_buffer_async_write;
				656	set_buffer_async_write(bh);
				657	}
				658	EXPORT_SYMBOL(mark_buffer_async_write);
				659
				660
				661	/*
				662	* fs/buffer.c contains helper functions for buffer-backed address space's
				663	* fsync functions. A common requirement for buffer-based filesystems is
				664	* that certain data from the backing blockdev needs to be written out for
				665	* a successful fsync(). For example, ext2 indirect blocks need to be
				666	* written back and waited upon before fsync() returns.
				667	*
				668	* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
				669	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				670	* management of a list of dependent buffers at ->i_mapping->private_list.
				671	*
				672	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				673	* from their controlling inode's queue when they are being freed. But
				674	* try_to_free_buffers() will be operating against the blockdev mapping
				675	* at the time, not against the S_ISREG file which depends on those buffers.
				676	* So the locking for private_list is via the private_lock in the address_space
				677	* which backs the buffers. Which is different from the address_space
				678	* against which the buffers are listed. So for a particular address_space,
				679	* mapping->private_lock does not protect mapping->private_list! In fact,
				680	* mapping->private_list will always be protected by the backing blockdev's
				681	* ->private_lock.
				682	*
				683	* Which introduces a requirement: all buffers on an address_space's
				684	* ->private_list must be from the same address_space: the blockdev's.
				685	*
				686	* address_spaces which do not place buffers at ->private_list via these
				687	* utility functions are free to use private_lock and private_list for
				688	* whatever they want. The only requirement is that list_empty(private_list)
				689	* be true at clear_inode() time.
				690	*
				691	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				692	* filesystems should do that. invalidate_inode_buffers() should just go
				693	* BUG_ON(!list_empty).
				694	*
				695	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				696	* take an address_space, not an inode. And it should be called
				697	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				698	* queued up.
				699	*
				700	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				701	* list if it is already on a list. Because if the buffer is on a list,
				702	* it must already be on the right one. If not, the filesystem is being
				703	* silly. This will save a ton of locking. But first we have to ensure
				704	* that buffers are taken off the old inode's list when they are freed
				705	* (presumably in truncate). That requires careful auditing of all
				706	* filesystems (do it inside bforget()). It could also be done by bringing
				707	* b_inode back.
				708	*/
				709
				710	/*
				711	* The buffer's backing address_space's private_lock must be held
				712	*/
				713	static inline void __remove_assoc_queue(struct buffer_head *bh)
				714	{
				715	list_del_init(&bh->b_assoc_buffers);
				716	}
				717
				718	int inode_has_buffers(struct inode *inode)
				719	{
				720	return !list_empty(&inode->i_data.private_list);
				721	}
				722
				723	/*
				724	* osync is designed to support O_SYNC io. It waits synchronously for
				725	* all already-submitted IO to complete, but does not queue any new
				726	* writes to the disk.
				727	*
				728	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				729	* you dirty the buffers, and then use osync_inode_buffers to wait for
				730	* completion. Any other dirty buffers which are not yet queued for
				731	* write will not be flushed to disk by the osync.
				732	*/
				733	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				734	{
				735	struct buffer_head *bh;
				736	struct list_head *p;
				737	int err = 0;
				738
				739	spin_lock(lock);
				740	repeat:
				741	list_for_each_prev(p, list) {
				742	bh = BH_ENTRY(p);
				743	if (buffer_locked(bh)) {
				744	get_bh(bh);
				745	spin_unlock(lock);
				746	wait_on_buffer(bh);
				747	if (!buffer_uptodate(bh))
				748	err = -EIO;
				749	brelse(bh);
				750	spin_lock(lock);
				751	goto repeat;
				752	}
				753	}
				754	spin_unlock(lock);
				755	return err;
				756	}
				757
				758	/**
				759	* sync_mapping_buffers - write out and wait upon a mapping's "associated"
				760	* buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	761	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	762	*
				763	* Starts I/O against the buffers at mapping->private_list, and waits upon
				764	* that I/O.
				765	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	766	* Basically, this is a convenience function for fsync().
				767	* @mapping is a file or directory which needs those buffers to be written for
				768	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	769	*/
				770	int sync_mapping_buffers(struct address_space *mapping)
				771	{
				772	struct address_space *buffer_mapping = mapping->assoc_mapping;
				773
				774	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				775	return 0;
				776
				777	return fsync_buffers_list(&buffer_mapping->private_lock,
				778	&mapping->private_list);
				779	}
				780	EXPORT_SYMBOL(sync_mapping_buffers);
				781
				782	/*
				783	* Called when we've recently written block `bblock', and it is known that
				784	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				785	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				786	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				787	*/
				788	void write_boundary_block(struct block_device *bdev,
				789	sector_t bblock, unsigned blocksize)
				790	{
				791	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				792	if (bh) {
				793	if (buffer_dirty(bh))
				794	ll_rw_block(WRITE, 1, &bh);
				795	put_bh(bh);
				796	}
				797	}
				798
				799	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				800	{
				801	struct address_space *mapping = inode->i_mapping;
				802	struct address_space *buffer_mapping = bh->b_page->mapping;
				803
				804	mark_buffer_dirty(bh);
				805	if (!mapping->assoc_mapping) {
				806	mapping->assoc_mapping = buffer_mapping;
				807	} else {
				808	if (mapping->assoc_mapping != buffer_mapping)
				809	BUG();
				810	}
				811	if (list_empty(&bh->b_assoc_buffers)) {
				812	spin_lock(&buffer_mapping->private_lock);
				813	list_move_tail(&bh->b_assoc_buffers,
				814	&mapping->private_list);
				815	spin_unlock(&buffer_mapping->private_lock);
				816	}
				817	}
				818	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				819
				820	/*
				821	* Add a page to the dirty page list.
				822	*
				823	* It is a sad fact of life that this function is called from several places
				824	* deeply under spinlocking. It may not sleep.
				825	*
				826	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				827	* dirty-state coherency between the page and the buffers. If the page does
				828	* not have buffers then when they are later attached they will all be set
				829	* dirty.
				830	*
				831	* The buffers are dirtied before the page is dirtied. There's a small race
				832	* window in which a writepage caller may see the page cleanness but not the
				833	* buffer dirtiness. That's fine. If this code were to set the page dirty
				834	* before the buffers, a concurrent writepage caller could clear the page dirty
				835	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				836	* page on the dirty page list.
				837	*
				838	* We use private_lock to lock against try_to_free_buffers while using the
				839	* page's buffer list. Also use this to protect against clean buffers being
				840	* added to the page after it was set dirty.
				841	*
				842	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				843	* address_space though.
				844	*/
				845	int __set_page_dirty_buffers(struct page *page)
				846	{
				847	struct address_space * const mapping = page->mapping;
				848
				849	spin_lock(&mapping->private_lock);
				850	if (page_has_buffers(page)) {
				851	struct buffer_head *head = page_buffers(page);
				852	struct buffer_head *bh = head;
				853
				854	do {
				855	set_buffer_dirty(bh);
				856	bh = bh->b_this_page;
				857	} while (bh != head);
				858	}
				859	spin_unlock(&mapping->private_lock);
				860
				861	if (!TestSetPageDirty(page)) {
				862	write_lock_irq(&mapping->tree_lock);
				863	if (page->mapping) { /* Race with truncate? */
				864	if (mapping_cap_account_dirty(mapping))
				865	inc_page_state(nr_dirty);
				866	radix_tree_tag_set(&mapping->page_tree,
				867	page_index(page),
				868	PAGECACHE_TAG_DIRTY);
				869	}
				870	write_unlock_irq(&mapping->tree_lock);
				871	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
				872	}
				873
				874	return 0;
				875	}
				876	EXPORT_SYMBOL(__set_page_dirty_buffers);
				877
				878	/*
				879	* Write out and wait upon a list of buffers.
				880	*
				881	* We have conflicting pressures: we want to make sure that all
				882	* initially dirty buffers get waited on, but that any subsequently
				883	* dirtied buffers don't. After all, we don't want fsync to last
				884	* forever if somebody is actively writing to the file.
				885	*
				886	* Do this in two main stages: first we copy dirty buffers to a
				887	* temporary inode list, queueing the writes as we go. Then we clean
				888	* up, waiting for those writes to complete.
				889	*
				890	* During this second stage, any subsequent updates to the file may end
				891	* up refiling the buffer on the original inode's dirty list again, so
				892	* there is a chance we will end up with a buffer queued for write but
				893	* not yet completed on that list. So, as a final cleanup we go through
				894	* the osync code to catch these locked, dirty buffers without requeuing
				895	* any newly dirty buffers for write.
				896	*/
				897	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				898	{
				899	struct buffer_head *bh;
				900	struct list_head tmp;
				901	int err = 0, err2;
				902
				903	INIT_LIST_HEAD(&tmp);
				904
				905	spin_lock(lock);
				906	while (!list_empty(list)) {
				907	bh = BH_ENTRY(list->next);
				908	list_del_init(&bh->b_assoc_buffers);
				909	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				910	list_add(&bh->b_assoc_buffers, &tmp);
				911	if (buffer_dirty(bh)) {
				912	get_bh(bh);
				913	spin_unlock(lock);
				914	/*
				915	* Ensure any pending I/O completes so that
				916	* ll_rw_block() actually writes the current
				917	* contents - it is a noop if I/O is still in
				918	* flight on potentially older contents.
				919	*/
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	920	ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	921	brelse(bh);
				922	spin_lock(lock);
				923	}
				924	}
				925	}
				926
				927	while (!list_empty(&tmp)) {
				928	bh = BH_ENTRY(tmp.prev);
				929	__remove_assoc_queue(bh);
				930	get_bh(bh);
				931	spin_unlock(lock);
				932	wait_on_buffer(bh);
				933	if (!buffer_uptodate(bh))
				934	err = -EIO;
				935	brelse(bh);
				936	spin_lock(lock);
				937	}
				938
				939	spin_unlock(lock);
				940	err2 = osync_buffers_list(lock, list);
				941	if (err)
				942	return err;
				943	else
				944	return err2;
				945	}
				946
				947	/*
				948	* Invalidate any and all dirty buffers on a given inode. We are
				949	* probably unmounting the fs, but that doesn't mean we have already
				950	* done a sync(). Just drop the buffers from the inode list.
				951	*
				952	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				953	* assumes that all the buffers are against the blockdev. Not true
				954	* for reiserfs.
				955	*/
				956	void invalidate_inode_buffers(struct inode *inode)
				957	{
				958	if (inode_has_buffers(inode)) {
				959	struct address_space *mapping = &inode->i_data;
				960	struct list_head *list = &mapping->private_list;
				961	struct address_space *buffer_mapping = mapping->assoc_mapping;
				962
				963	spin_lock(&buffer_mapping->private_lock);
				964	while (!list_empty(list))
				965	__remove_assoc_queue(BH_ENTRY(list->next));
				966	spin_unlock(&buffer_mapping->private_lock);
				967	}
				968	}
				969
				970	/*
				971	* Remove any clean buffers from the inode's buffer list. This is called
				972	* when we're trying to free the inode itself. Those buffers can pin it.
				973	*
				974	* Returns true if all buffers were removed.
				975	*/
				976	int remove_inode_buffers(struct inode *inode)
				977	{
				978	int ret = 1;
				979
				980	if (inode_has_buffers(inode)) {
				981	struct address_space *mapping = &inode->i_data;
				982	struct list_head *list = &mapping->private_list;
				983	struct address_space *buffer_mapping = mapping->assoc_mapping;
				984
				985	spin_lock(&buffer_mapping->private_lock);
				986	while (!list_empty(list)) {
				987	struct buffer_head *bh = BH_ENTRY(list->next);
				988	if (buffer_dirty(bh)) {
				989	ret = 0;
				990	break;
				991	}
				992	__remove_assoc_queue(bh);
				993	}
				994	spin_unlock(&buffer_mapping->private_lock);
				995	}
				996	return ret;
				997	}
				998
				999	/*
				1000	* Create the appropriate buffers when given a page for data area and
				1001	* the size of each buffer.. Use the bh->b_this_page linked list to
				1002	* follow the buffers created. Return NULL if unable to create more
				1003	* buffers.
				1004	*
				1005	* The retry flag is used to differentiate async IO (paging, swapping)
				1006	* which may not fail from ordinary buffer allocations.
				1007	*/
				1008	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				1009	int retry)
				1010	{
				1011	struct buffer_head bh, head;
				1012	long offset;
				1013
				1014	try_again:
				1015	head = NULL;
				1016	offset = PAGE_SIZE;
				1017	while ((offset -= size) >= 0) {
				1018	bh = alloc_buffer_head(GFP_NOFS);
				1019	if (!bh)
				1020	goto no_grow;
				1021
				1022	bh->b_bdev = NULL;
				1023	bh->b_this_page = head;
				1024	bh->b_blocknr = -1;
				1025	head = bh;
				1026
				1027	bh->b_state = 0;
				1028	atomic_set(&bh->b_count, 0);
				1029	bh->b_size = size;
				1030
				1031	/* Link the buffer to its page */
				1032	set_bh_page(bh, page, offset);
				1033
				1034	bh->b_end_io = NULL;
				1035	}
				1036	return head;
				1037	/*
				1038	* In case anything failed, we just free everything we got.
				1039	*/
				1040	no_grow:
				1041	if (head) {
				1042	do {
				1043	bh = head;
				1044	head = head->b_this_page;
				1045	free_buffer_head(bh);
				1046	} while (head);
				1047	}
				1048
				1049	/*
				1050	* Return failure for non-async IO requests. Async IO requests
				1051	* are not allowed to fail, so we have to wait until buffer heads
				1052	* become available. But we don't want tasks sleeping with
				1053	* partially complete buffers, so all were released above.
				1054	*/
				1055	if (!retry)
				1056	return NULL;
				1057
				1058	/* We're _really_ low on memory. Now we just
				1059	* wait for old buffer heads to become free due to
				1060	* finishing IO. Since this is an async request and
				1061	* the reserve list is empty, we're sure there are
				1062	* async buffer heads in use.
				1063	*/
				1064	free_more_memory();
				1065	goto try_again;
				1066	}
				1067	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				1068
				1069	static inline void
				1070	link_dev_buffers(struct page page, struct buffer_head head)
				1071	{
				1072	struct buffer_head bh, tail;
				1073
				1074	bh = head;
				1075	do {
				1076	tail = bh;
				1077	bh = bh->b_this_page;
				1078	} while (bh);
				1079	tail->b_this_page = head;
				1080	attach_page_buffers(page, head);
				1081	}
				1082
				1083	/*
				1084	* Initialise the state of a blockdev page's buffers.
				1085	*/
				1086	static void
				1087	init_page_buffers(struct page page, struct block_device bdev,
				1088	sector_t block, int size)
				1089	{
				1090	struct buffer_head *head = page_buffers(page);
				1091	struct buffer_head *bh = head;
				1092	int uptodate = PageUptodate(page);
				1093
				1094	do {
				1095	if (!buffer_mapped(bh)) {
				1096	init_buffer(bh, NULL, NULL);
				1097	bh->b_bdev = bdev;
				1098	bh->b_blocknr = block;
				1099	if (uptodate)
				1100	set_buffer_uptodate(bh);
				1101	set_buffer_mapped(bh);
				1102	}
				1103	block++;
				1104	bh = bh->b_this_page;
				1105	} while (bh != head);
				1106	}
				1107
				1108	/*
				1109	* Create the page-cache page that contains the requested block.
				1110	*
				1111	* This is user purely for blockdev mappings.
				1112	*/
				1113	static struct page *
				1114	grow_dev_page(struct block_device *bdev, sector_t block,
				1115	pgoff_t index, int size)
				1116	{
				1117	struct inode *inode = bdev->bd_inode;
				1118	struct page *page;
				1119	struct buffer_head *bh;
				1120
				1121	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
				1122	if (!page)
				1123	return NULL;
				1124
				1125	if (!PageLocked(page))
				1126	BUG();
				1127
				1128	if (page_has_buffers(page)) {
				1129	bh = page_buffers(page);
				1130	if (bh->b_size == size) {
				1131	init_page_buffers(page, bdev, block, size);
				1132	return page;
				1133	}
				1134	if (!try_to_free_buffers(page))
				1135	goto failed;
				1136	}
				1137
				1138	/*
				1139	* Allocate some buffers for this page
				1140	*/
				1141	bh = alloc_page_buffers(page, size, 0);
				1142	if (!bh)
				1143	goto failed;
				1144
				1145	/*
				1146	* Link the page to the buffers and initialise them. Take the
				1147	* lock to be atomic wrt __find_get_block(), which does not
				1148	* run under the page lock.
				1149	*/
				1150	spin_lock(&inode->i_mapping->private_lock);
				1151	link_dev_buffers(page, bh);
				1152	init_page_buffers(page, bdev, block, size);
				1153	spin_unlock(&inode->i_mapping->private_lock);
				1154	return page;
				1155
				1156	failed:
				1157	BUG();
				1158	unlock_page(page);
				1159	page_cache_release(page);
				1160	return NULL;
				1161	}
				1162
				1163	/*
				1164	* Create buffers for the specified block device block's page. If
				1165	* that page was dirty, the buffers are set dirty also.
				1166	*
				1167	* Except that's a bug. Attaching dirty buffers to a dirty
				1168	* blockdev's page can result in filesystem corruption, because
				1169	* some of those buffers may be aliases of filesystem data.
				1170	* grow_dev_page() will go BUG() if this happens.
				1171	*/
				1172	static inline int
				1173	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1174	{
				1175	struct page *page;
				1176	pgoff_t index;
				1177	int sizebits;
				1178
				1179	sizebits = -1;
				1180	do {
				1181	sizebits++;
				1182	} while ((size << sizebits) < PAGE_SIZE);
				1183
				1184	index = block >> sizebits;
				1185	block = index << sizebits;
				1186
				1187	/* Create a page with the proper size buffers.. */
				1188	page = grow_dev_page(bdev, block, index, size);
				1189	if (!page)
				1190	return 0;
				1191	unlock_page(page);
				1192	page_cache_release(page);
				1193	return 1;
				1194	}
				1195
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1196	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1197	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1198	{
				1199	/* Size must be multiple of hard sectorsize */
				1200	if (unlikely(size & (bdev_hardsect_size(bdev)-1) \|\|
				1201	(size < 512 \|\| size > PAGE_SIZE))) {
				1202	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1203	size);
				1204	printk(KERN_ERR "hardsect size: %d\n",
				1205	bdev_hardsect_size(bdev));
				1206
				1207	dump_stack();
				1208	return NULL;
				1209	}
				1210
				1211	for (;;) {
				1212	struct buffer_head * bh;
				1213
				1214	bh = __find_get_block(bdev, block, size);
				1215	if (bh)
				1216	return bh;
				1217
				1218	if (!grow_buffers(bdev, block, size))
				1219	free_more_memory();
				1220	}
				1221	}
				1222
				1223	/*
				1224	* The relationship between dirty buffers and dirty pages:
				1225	*
				1226	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1227	* the page is tagged dirty in its radix tree.
				1228	*
				1229	* At all times, the dirtiness of the buffers represents the dirtiness of
				1230	* subsections of the page. If the page has buffers, the page dirty bit is
				1231	* merely a hint about the true dirty state.
				1232	*
				1233	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1234	* (if the page has buffers).
				1235	*
				1236	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1237	* buffers are not.
				1238	*
				1239	* Also. When blockdev buffers are explicitly read with bread(), they
				1240	* individually become uptodate. But their backing page remains not
				1241	* uptodate - even if all of its buffers are uptodate. A subsequent
				1242	* block_read_full_page() against that page will discover all the uptodate
				1243	* buffers, will set the page uptodate and will perform no I/O.
				1244	*/
				1245
				1246	/**
				1247	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1248	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1249	*
				1250	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1251	* backing page dirty, then tag the page as dirty in its address_space's radix
				1252	* tree and then attach the address_space's inode to its superblock's dirty
				1253	* inode list.
				1254	*
				1255	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1256	* mapping->tree_lock and the global inode_lock.
				1257	*/
				1258	void fastcall mark_buffer_dirty(struct buffer_head *bh)
				1259	{
				1260	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
				1261	__set_page_dirty_nobuffers(bh->b_page);
				1262	}
				1263
				1264	/*
				1265	* Decrement a buffer_head's reference count. If all buffers against a page
				1266	* have zero reference count, are clean and unlocked, and if the page is clean
				1267	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1268	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1269	* a page but it ends up not being freed, and buffers may later be reattached).
				1270	*/
				1271	void __brelse(struct buffer_head * buf)
				1272	{
				1273	if (atomic_read(&buf->b_count)) {
				1274	put_bh(buf);
				1275	return;
				1276	}
				1277	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
				1278	WARN_ON(1);
				1279	}
				1280
				1281	/*
				1282	* bforget() is like brelse(), except it discards any
				1283	* potentially dirty data.
				1284	*/
				1285	void __bforget(struct buffer_head *bh)
				1286	{
				1287	clear_buffer_dirty(bh);
				1288	if (!list_empty(&bh->b_assoc_buffers)) {
				1289	struct address_space *buffer_mapping = bh->b_page->mapping;
				1290
				1291	spin_lock(&buffer_mapping->private_lock);
				1292	list_del_init(&bh->b_assoc_buffers);
				1293	spin_unlock(&buffer_mapping->private_lock);
				1294	}
				1295	__brelse(bh);
				1296	}
				1297
				1298	static struct buffer_head __bread_slow(struct buffer_head bh)
				1299	{
				1300	lock_buffer(bh);
				1301	if (buffer_uptodate(bh)) {
				1302	unlock_buffer(bh);
				1303	return bh;
				1304	} else {
				1305	get_bh(bh);
				1306	bh->b_end_io = end_buffer_read_sync;
				1307	submit_bh(READ, bh);
				1308	wait_on_buffer(bh);
				1309	if (buffer_uptodate(bh))
				1310	return bh;
				1311	}
				1312	brelse(bh);
				1313	return NULL;
				1314	}
				1315
				1316	/*
				1317	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1318	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1319	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1320	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1321	* CPU's LRUs at the same time.
				1322	*
				1323	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1324	* sb_find_get_block().
				1325	*
				1326	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1327	* a local interrupt disable for that.
				1328	*/
				1329
				1330	#define BH_LRU_SIZE 8
				1331
				1332	struct bh_lru {
				1333	struct buffer_head *bhs[BH_LRU_SIZE];
				1334	};
				1335
				1336	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1337
				1338	#ifdef CONFIG_SMP
				1339	#define bh_lru_lock() local_irq_disable()
				1340	#define bh_lru_unlock() local_irq_enable()
				1341	#else
				1342	#define bh_lru_lock() preempt_disable()
				1343	#define bh_lru_unlock() preempt_enable()
				1344	#endif
				1345
				/*
				 * Sanity check: BUG() if called with interrupts disabled.  Compiles to
				 * nothing when irqs_disabled is not available as a macro.
				 */
				static inline void check_irqs_on(void)
				{
				#if defined(irqs_disabled)
					BUG_ON(irqs_disabled());
				#endif
				}
				1352
				1353	/*
				1354	* The LRU management algorithm is dopey-but-simple. Sorry.
				1355	*/
				1356	static void bh_lru_install(struct buffer_head *bh)
				1357	{
				1358	struct buffer_head *evictee = NULL;
				1359	struct bh_lru *lru;
				1360
				1361	check_irqs_on();
				1362	bh_lru_lock();
				1363	lru = &__get_cpu_var(bh_lrus);
				1364	if (lru->bhs[0] != bh) {
				1365	struct buffer_head *bhs[BH_LRU_SIZE];
				1366	int in;
				1367	int out = 0;
				1368
				1369	get_bh(bh);
				1370	bhs[out++] = bh;
				1371	for (in = 0; in < BH_LRU_SIZE; in++) {
				1372	struct buffer_head *bh2 = lru->bhs[in];
				1373
				1374	if (bh2 == bh) {
				1375	__brelse(bh2);
				1376	} else {
				1377	if (out >= BH_LRU_SIZE) {
				1378	BUG_ON(evictee != NULL);
				1379	evictee = bh2;
				1380	} else {
				1381	bhs[out++] = bh2;
				1382	}
				1383	}
				1384	}
				1385	while (out < BH_LRU_SIZE)
				1386	bhs[out++] = NULL;
				1387	memcpy(lru->bhs, bhs, sizeof(bhs));
				1388	}
				1389	bh_lru_unlock();
				1390
				1391	if (evictee)
				1392	__brelse(evictee);
				1393	}
				1394
				1395	/*
				1396	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1397	*/
				1398	static inline struct buffer_head *
				1399	lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
				1400	{
				1401	struct buffer_head *ret = NULL;
				1402	struct bh_lru *lru;
				1403	int i;
				1404
				1405	check_irqs_on();
				1406	bh_lru_lock();
				1407	lru = &__get_cpu_var(bh_lrus);
				1408	for (i = 0; i < BH_LRU_SIZE; i++) {
				1409	struct buffer_head *bh = lru->bhs[i];
				1410
				1411	if (bh && bh->b_bdev == bdev &&
				1412	bh->b_blocknr == block && bh->b_size == size) {
				1413	if (i) {
				1414	while (i) {
				1415	lru->bhs[i] = lru->bhs[i - 1];
				1416	i--;
				1417	}
				1418	lru->bhs[0] = bh;
				1419	}
				1420	get_bh(bh);
				1421	ret = bh;
				1422	break;
				1423	}
				1424	}
				1425	bh_lru_unlock();
				1426	return ret;
				1427	}
				1428
				1429	/*
				1430	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1431	* it in the LRU and mark it as accessed. If it is not present then return
				1432	* NULL
				1433	*/
				1434	struct buffer_head *
				1435	__find_get_block(struct block_device *bdev, sector_t block, int size)
				1436	{
				1437	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1438
				1439	if (bh == NULL) {
				1440	bh = __find_get_block_slow(bdev, block, size);
				1441	if (bh)
				1442	bh_lru_install(bh);
				1443	}
				1444	if (bh)
				1445	touch_buffer(bh);
				1446	return bh;
				1447	}
				1448	EXPORT_SYMBOL(__find_get_block);
				1449
				1450	/*
				1451	* __getblk will locate (and, if necessary, create) the buffer_head
				1452	* which corresponds to the passed block_device, block and size. The
				1453	* returned buffer has its reference count incremented.
				1454	*
				1455	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1456	* illegal block number, __getblk() will happily return a buffer_head
				1457	* which represents the non-existent block. Very weird.
				1458	*
				1459	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1460	* attempt is failing. FIXME, perhaps?
				1461	*/
				1462	struct buffer_head *
				1463	__getblk(struct block_device *bdev, sector_t block, int size)
				1464	{
				1465	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1466
				1467	might_sleep();
				1468	if (bh == NULL)
				1469	bh = __getblk_slow(bdev, block, size);
				1470	return bh;
				1471	}
				1472	EXPORT_SYMBOL(__getblk);
				1473
				1474	/*
				1475	* Do async read-ahead on a buffer..
				1476	*/
				1477	void __breadahead(struct block_device *bdev, sector_t block, int size)
				1478	{
				1479	struct buffer_head *bh = __getblk(bdev, block, size);
				1480	ll_rw_block(READA, 1, &bh);
				1481	brelse(bh);
				1482	}
				1483	EXPORT_SYMBOL(__breadahead);
				1484
				1485	/**
				1486	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1487	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1488	* @block: number of block
				1489	* @size: size (in bytes) to read
				1490	*
				1491	* Reads a specified block, and returns buffer head that contains it.
				1492	* It returns NULL if the block was unreadable.
				1493	*/
				1494	struct buffer_head *
				1495	__bread(struct block_device *bdev, sector_t block, int size)
				1496	{
				1497	struct buffer_head *bh = __getblk(bdev, block, size);
				1498
				1499	if (!buffer_uptodate(bh))
				1500	bh = __bread_slow(bh);
				1501	return bh;
				1502	}
				1503	EXPORT_SYMBOL(__bread);
				1504
				1505	/*
				1506	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1507	* This doesn't race because it runs in each cpu either in irq
				1508	* or with preempt disabled.
				1509	*/
				1510	static void invalidate_bh_lru(void *arg)
				1511	{
				1512	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1513	int i;
				1514
				1515	for (i = 0; i < BH_LRU_SIZE; i++) {
				1516	brelse(b->bhs[i]);
				1517	b->bhs[i] = NULL;
				1518	}
				1519	put_cpu_var(bh_lrus);
				1520	}
				1521
				1522	static void invalidate_bh_lrus(void)
				1523	{
				1524	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
				1525	}
				1526
				1527	void set_bh_page(struct buffer_head *bh,
				1528	struct page *page, unsigned long offset)
				1529	{
				1530	bh->b_page = page;
				1531	if (offset >= PAGE_SIZE)
				1532	BUG();
				1533	if (PageHighMem(page))
				1534	/*
				1535	* This catches illegal uses and preserves the offset:
				1536	*/
				1537	bh->b_data = (char *)(0 + offset);
				1538	else
				1539	bh->b_data = page_address(page) + offset;
				1540	}
				1541	EXPORT_SYMBOL(set_bh_page);
				1542
				1543	/*
				1544	* Called when truncating a buffer on a page completely.
				1545	*/
				1546	static inline void discard_buffer(struct buffer_head * bh)
				1547	{
				1548	lock_buffer(bh);
				1549	clear_buffer_dirty(bh);
				1550	bh->b_bdev = NULL;
				1551	clear_buffer_mapped(bh);
				1552	clear_buffer_req(bh);
				1553	clear_buffer_new(bh);
				1554	clear_buffer_delay(bh);
				1555	unlock_buffer(bh);
				1556	}
				1557
				1558	/**
				1559	* try_to_release_page() - release old fs-specific metadata on a page
				1560	*
				1561	* @page: the page which the kernel is trying to free
				1562	* @gfp_mask: memory allocation flags (and I/O mode)
				1563	*
				1564	* The address_space is to try to release any data against the page
				1565	* (presumably at page->private). If the release was successful, return `1'.
				1566	* Otherwise return zero.
				1567	*
				1568	* The @gfp_mask argument specifies whether I/O may be performed to release
				1569	* this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
				1570	*
				1571	* NOTE: @gfp_mask may go away, and this function may become non-blocking.
				1572	*/
				1573	int try_to_release_page(struct page *page, int gfp_mask)
				1574	{
				1575	struct address_space * const mapping = page->mapping;
				1576
				1577	BUG_ON(!PageLocked(page));
				1578	if (PageWriteback(page))
				1579	return 0;
				1580
				1581	if (mapping && mapping->a_ops->releasepage)
				1582	return mapping->a_ops->releasepage(page, gfp_mask);
				1583	return try_to_free_buffers(page);
				1584	}
				1585	EXPORT_SYMBOL(try_to_release_page);
				1586
				1587	/**
				1588	* block_invalidatepage - invalidate part of all of a buffer-backed page
				1589	*
				1590	* @page: the page which is affected
				1591	* @offset: the index of the truncation point
				1592	*
				1593	* block_invalidatepage() is called when all or part of the page has become
				1594	* invalidatedby a truncate operation.
				1595	*
				1596	* block_invalidatepage() does not have to release all buffers, but it must
				1597	* ensure that no dirty buffer is left outside @offset and that no I/O
				1598	* is underway against any of the blocks which are outside the truncation
				1599	* point. Because the caller is about to free (and possibly reuse) those
				1600	* blocks on-disk.
				1601	*/
				1602	int block_invalidatepage(struct page *page, unsigned long offset)
				1603	{
				1604	struct buffer_head head, bh, *next;
				1605	unsigned int curr_off = 0;
				1606	int ret = 1;
				1607
				1608	BUG_ON(!PageLocked(page));
				1609	if (!page_has_buffers(page))
				1610	goto out;
				1611
				1612	head = page_buffers(page);
				1613	bh = head;
				1614	do {
				1615	unsigned int next_off = curr_off + bh->b_size;
				1616	next = bh->b_this_page;
				1617
				1618	/*
				1619	* is this block fully invalidated?
				1620	*/
				1621	if (offset <= curr_off)
				1622	discard_buffer(bh);
				1623	curr_off = next_off;
				1624	bh = next;
				1625	} while (bh != head);
				1626
				1627	/*
				1628	* We release buffers only if the entire page is being invalidated.
				1629	* The get_block cached value has been unconditionally invalidated,
				1630	* so real IO is not possible anymore.
				1631	*/
				1632	if (offset == 0)
				1633	ret = try_to_release_page(page, 0);
				1634	out:
				1635	return ret;
				1636	}
				1637	EXPORT_SYMBOL(block_invalidatepage);
				1638
				1639	/*
				1640	* We attach and possibly dirty the buffers atomically wrt
				1641	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1642	* is already excluded via the page lock.
				1643	*/
				1644	void create_empty_buffers(struct page *page,
				1645	unsigned long blocksize, unsigned long b_state)
				1646	{
				1647	struct buffer_head bh, head, *tail;
				1648
				1649	head = alloc_page_buffers(page, blocksize, 1);
				1650	bh = head;
				1651	do {
				1652	bh->b_state \|= b_state;
				1653	tail = bh;
				1654	bh = bh->b_this_page;
				1655	} while (bh);
				1656	tail->b_this_page = head;
				1657
				1658	spin_lock(&page->mapping->private_lock);
				1659	if (PageUptodate(page) \|\| PageDirty(page)) {
				1660	bh = head;
				1661	do {
				1662	if (PageDirty(page))
				1663	set_buffer_dirty(bh);
				1664	if (PageUptodate(page))
				1665	set_buffer_uptodate(bh);
				1666	bh = bh->b_this_page;
				1667	} while (bh != head);
				1668	}
				1669	attach_page_buffers(page, head);
				1670	spin_unlock(&page->mapping->private_lock);
				1671	}
				1672	EXPORT_SYMBOL(create_empty_buffers);
				1673
				1674	/*
				1675	* We are taking a block for data and we don't want any output from any
				1676	* buffer-cache aliases starting from return from that function and
				1677	* until the moment when something will explicitly mark the buffer
				1678	* dirty (hopefully that will not happen until we will free that block ;-)
				1679	* We don't even need to mark it not-uptodate - nobody can expect
				1680	* anything from a newly allocated buffer anyway. We used to use
				1681	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1682	* don't want to mark the alias unmapped, for example - it would confuse
				1683	* anyone who might pick it with bread() afterwards...
				1684	*
				1685	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1686	* be writeout I/O going on against recently-freed buffers. We don't
				1687	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1688	* only if we really need to. That happens here.
				1689	*/
				1690	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1691	{
				1692	struct buffer_head *old_bh;
				1693
				1694	might_sleep();
				1695
				1696	old_bh = __find_get_block_slow(bdev, block, 0);
				1697	if (old_bh) {
				1698	clear_buffer_dirty(old_bh);
				1699	wait_on_buffer(old_bh);
				1700	clear_buffer_req(old_bh);
				1701	__brelse(old_bh);
				1702	}
				1703	}
				1704	EXPORT_SYMBOL(unmap_underlying_metadata);
				1705
				1706	/*
				1707	* NOTE! All mapped/uptodate combinations are valid:
				1708	*
				1709	* Mapped Uptodate Meaning
				1710	*
				1711	* No No "unknown" - must do get_block()
				1712	* No Yes "hole" - zero-filled
				1713	* Yes No "allocated" - allocated on disk, not read in
				1714	* Yes Yes "valid" - allocated and up-to-date in memory.
				1715	*
				1716	* "Dirty" is valid only with the last case (mapped+uptodate).
				1717	*/
				1718
				1719	/*
				1720	* While block_write_full_page is writing back the dirty buffers under
				1721	* the page lock, whoever dirtied the buffers may decide to clean them
				1722	* again at any time. We handle that by only looking at the buffer
				1723	* state inside lock_buffer().
				1724	*
				1725	* If block_write_full_page() is called for regular writeback
				1726	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1727	* locked buffer. This only can happen if someone has written the buffer
				1728	* directly, with submit_bh(). At the address_space level PageWriteback
				1729	* prevents this contention from occurring.
				1730	*/
				1731	static int __block_write_full_page(struct inode inode, struct page page,
				1732	get_block_t get_block, struct writeback_control wbc)
				1733	{
				1734	int err;
				1735	sector_t block;
				1736	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1737	struct buffer_head bh, head;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1738	int nr_underway = 0;
				1739
				1740	BUG_ON(!PageLocked(page));
				1741
				1742	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1743
				1744	if (!page_has_buffers(page)) {
				1745	create_empty_buffers(page, 1 << inode->i_blkbits,
				1746	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1747	}
				1748
				1749	/*
				1750	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1751	* here, and the (potentially unmapped) buffers may become dirty at
				1752	* any time. If a buffer becomes dirty here after we've inspected it
				1753	* then we just miss that fact, and the page stays dirty.
				1754	*
				1755	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1756	* handle that here by just cleaning them.
				1757	*/
				1758
				1759	block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				1760	head = page_buffers(page);
				1761	bh = head;
				1762
				1763	/*
				1764	* Get all the dirty buffers mapped to disk addresses and
				1765	* handle any aliases from the underlying blockdev's mapping.
				1766	*/
				1767	do {
				1768	if (block > last_block) {
				1769	/*
				1770	* mapped buffers outside i_size will occur, because
				1771	* this page can be outside i_size when there is a
				1772	* truncate in progress.
				1773	*/
				1774	/*
				1775	* The buffer was zeroed by block_write_full_page()
				1776	*/
				1777	clear_buffer_dirty(bh);
				1778	set_buffer_uptodate(bh);
				1779	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
				1780	err = get_block(inode, block, bh, 1);
				1781	if (err)
				1782	goto recover;
				1783	if (buffer_new(bh)) {
				1784	/* blockdev mappings never come here */
				1785	clear_buffer_new(bh);
				1786	unmap_underlying_metadata(bh->b_bdev,
				1787	bh->b_blocknr);
				1788	}
				1789	}
				1790	bh = bh->b_this_page;
				1791	block++;
				1792	} while (bh != head);
				1793
				1794	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1795	if (!buffer_mapped(bh))
				1796	continue;
				1797	/*
				1798	* If it's a fully non-blocking write attempt and we cannot
				1799	* lock the buffer then redirty the page. Note that this can
				1800	* potentially cause a busy-wait loop from pdflush and kswapd
				1801	* activity, but those code paths have their own higher-level
				1802	* throttling.
				1803	*/
				1804	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1805	lock_buffer(bh);
				1806	} else if (test_set_buffer_locked(bh)) {
				1807	redirty_page_for_writepage(wbc, page);
				1808	continue;
				1809	}
				1810	if (test_clear_buffer_dirty(bh)) {
				1811	mark_buffer_async_write(bh);
				1812	} else {
				1813	unlock_buffer(bh);
				1814	}
				1815	} while ((bh = bh->b_this_page) != head);
				1816
				1817	/*
				1818	* The page and its buffers are protected by PageWriteback(), so we can
				1819	* drop the bh refcounts early.
				1820	*/
				1821	BUG_ON(PageWriteback(page));
				1822	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1823
				1824	do {
				1825	struct buffer_head *next = bh->b_this_page;
				1826	if (buffer_async_write(bh)) {
				1827	submit_bh(WRITE, bh);
				1828	nr_underway++;
				1829	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1830	bh = next;
				1831	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1832	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1833
				1834	err = 0;
				1835	done:
				1836	if (nr_underway == 0) {
				1837	/*
				1838	* The page was marked dirty, but the buffers were
				1839	* clean. Someone wrote them back by hand with
				1840	* ll_rw_block/submit_bh. A rare case.
				1841	*/
				1842	int uptodate = 1;
				1843	do {
				1844	if (!buffer_uptodate(bh)) {
				1845	uptodate = 0;
				1846	break;
				1847	}
				1848	bh = bh->b_this_page;
				1849	} while (bh != head);
				1850	if (uptodate)
				1851	SetPageUptodate(page);
				1852	end_page_writeback(page);
				1853	/*
				1854	* The page and buffer_heads can be released at any time from
				1855	* here on.
				1856	*/
				1857	wbc->pages_skipped++; /* We didn't write this page */
				1858	}
				1859	return err;
				1860
				1861	recover:
				1862	/*
				1863	* ENOSPC, or some other error. We may already have added some
				1864	* blocks to the file, so we need to write these out to avoid
				1865	* exposing stale data.
				1866	* The page is currently locked and not marked for writeback
				1867	*/
				1868	bh = head;
				1869	/* Recovery: lock and submit the mapped buffers */
				1870	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1871	if (buffer_mapped(bh) && buffer_dirty(bh)) {
				1872	lock_buffer(bh);
				1873	mark_buffer_async_write(bh);
				1874	} else {
				1875	/*
				1876	* The buffer may have been set dirty during
				1877	* attachment to a dirty page.
				1878	*/
				1879	clear_buffer_dirty(bh);
				1880	}
				1881	} while ((bh = bh->b_this_page) != head);
				1882	SetPageError(page);
				1883	BUG_ON(PageWriteback(page));
				1884	set_page_writeback(page);
				1885	unlock_page(page);
				1886	do {
				1887	struct buffer_head *next = bh->b_this_page;
				1888	if (buffer_async_write(bh)) {
				1889	clear_buffer_dirty(bh);
				1890	submit_bh(WRITE, bh);
				1891	nr_underway++;
				1892	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1893	bh = next;
				1894	} while (bh != head);
				1895	goto done;
				1896	}
				1897
				1898	static int __block_prepare_write(struct inode inode, struct page page,
				1899	unsigned from, unsigned to, get_block_t *get_block)
				1900	{
				1901	unsigned block_start, block_end;
				1902	sector_t block;
				1903	int err = 0;
				1904	unsigned blocksize, bbits;
				1905	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1906
				1907	BUG_ON(!PageLocked(page));
				1908	BUG_ON(from > PAGE_CACHE_SIZE);
				1909	BUG_ON(to > PAGE_CACHE_SIZE);
				1910	BUG_ON(from > to);
				1911
				1912	blocksize = 1 << inode->i_blkbits;
				1913	if (!page_has_buffers(page))
				1914	create_empty_buffers(page, blocksize, 0);
				1915	head = page_buffers(page);
				1916
				1917	bbits = inode->i_blkbits;
				1918	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1919
				1920	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1921	block++, block_start=block_end, bh = bh->b_this_page) {
				1922	block_end = block_start + blocksize;
				1923	if (block_end <= from \|\| block_start >= to) {
				1924	if (PageUptodate(page)) {
				1925	if (!buffer_uptodate(bh))
				1926	set_buffer_uptodate(bh);
				1927	}
				1928	continue;
				1929	}
				1930	if (buffer_new(bh))
				1931	clear_buffer_new(bh);
				1932	if (!buffer_mapped(bh)) {
				1933	err = get_block(inode, block, bh, 1);
				1934	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1935	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1936	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1937	unmap_underlying_metadata(bh->b_bdev,
				1938	bh->b_blocknr);
				1939	if (PageUptodate(page)) {
				1940	set_buffer_uptodate(bh);
				1941	continue;
				1942	}
				1943	if (block_end > to \|\| block_start < from) {
				1944	void *kaddr;
				1945
				1946	kaddr = kmap_atomic(page, KM_USER0);
				1947	if (block_end > to)
				1948	memset(kaddr+to, 0,
				1949	block_end-to);
				1950	if (block_start < from)
				1951	memset(kaddr+block_start,
				1952	0, from-block_start);
				1953	flush_dcache_page(page);
				1954	kunmap_atomic(kaddr, KM_USER0);
				1955	}
				1956	continue;
				1957	}
				1958	}
				1959	if (PageUptodate(page)) {
				1960	if (!buffer_uptodate(bh))
				1961	set_buffer_uptodate(bh);
				1962	continue;
				1963	}
				1964	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
				1965	(block_start < from \|\| block_end > to)) {
				1966	ll_rw_block(READ, 1, &bh);
				1967	*wait_bh++=bh;
				1968	}
				1969	}
				1970	/*
				1971	* If we issued read requests - let them complete.
				1972	*/
				1973	while(wait_bh > wait) {
				1974	wait_on_buffer(*--wait_bh);
				1975	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1976	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1977	}
Anton Altaparmakov	152becd	2005-06-23 00:10:21 -0700	[diff] [blame]	1978	if (!err) {
				1979	bh = head;
				1980	do {
				1981	if (buffer_new(bh))
				1982	clear_buffer_new(bh);
				1983	} while ((bh = bh->b_this_page) != head);
				1984	return 0;
				1985	}
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1986	/* Error case: */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1987	/*
				1988	* Zero out any newly allocated blocks to avoid exposing stale
				1989	* data. If BH_New is set, we know that the block was newly
				1990	* allocated in the above loop.
				1991	*/
				1992	bh = head;
				1993	block_start = 0;
				1994	do {
				1995	block_end = block_start+blocksize;
				1996	if (block_end <= from)
				1997	goto next_bh;
				1998	if (block_start >= to)
				1999	break;
				2000	if (buffer_new(bh)) {
				2001	void *kaddr;
				2002
				2003	clear_buffer_new(bh);
				2004	kaddr = kmap_atomic(page, KM_USER0);
				2005	memset(kaddr+block_start, 0, bh->b_size);
				2006	kunmap_atomic(kaddr, KM_USER0);
				2007	set_buffer_uptodate(bh);
				2008	mark_buffer_dirty(bh);
				2009	}
				2010	next_bh:
				2011	block_start = block_end;
				2012	bh = bh->b_this_page;
				2013	} while (bh != head);
				2014	return err;
				2015	}
				2016
				2017	static int __block_commit_write(struct inode inode, struct page page,
				2018	unsigned from, unsigned to)
				2019	{
				2020	unsigned block_start, block_end;
				2021	int partial = 0;
				2022	unsigned blocksize;
				2023	struct buffer_head bh, head;
				2024
				2025	blocksize = 1 << inode->i_blkbits;
				2026
				2027	for(bh = head = page_buffers(page), block_start = 0;
				2028	bh != head \|\| !block_start;
				2029	block_start=block_end, bh = bh->b_this_page) {
				2030	block_end = block_start + blocksize;
				2031	if (block_end <= from \|\| block_start >= to) {
				2032	if (!buffer_uptodate(bh))
				2033	partial = 1;
				2034	} else {
				2035	set_buffer_uptodate(bh);
				2036	mark_buffer_dirty(bh);
				2037	}
				2038	}
				2039
				2040	/*
				2041	* If this is a partial write which happened to make all buffers
				2042	* uptodate then we can optimize away a bogus readpage() for
				2043	* the next read(). Here we 'discover' whether the page went
				2044	* uptodate as a result of this (potentially partial) write.
				2045	*/
				2046	if (!partial)
				2047	SetPageUptodate(page);
				2048	return 0;
				2049	}
				2050
				2051	/*
				2052	* Generic "read page" function for block devices that have the normal
				2053	* get_block functionality. This is most of the block device filesystems.
				2054	* Reads the page asynchronously --- the unlock_buffer() and
				2055	* set/clear_buffer_uptodate() functions propagate buffer state into the
				2056	* page struct once IO has completed.
				2057	*/
				2058	int block_read_full_page(struct page page, get_block_t get_block)
				2059	{
				2060	struct inode *inode = page->mapping->host;
				2061	sector_t iblock, lblock;
				2062	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				2063	unsigned int blocksize;
				2064	int nr, i;
				2065	int fully_mapped = 1;
				2066
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	2067	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2068	blocksize = 1 << inode->i_blkbits;
				2069	if (!page_has_buffers(page))
				2070	create_empty_buffers(page, blocksize, 0);
				2071	head = page_buffers(page);
				2072
				2073	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2074	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				2075	bh = head;
				2076	nr = 0;
				2077	i = 0;
				2078
				2079	do {
				2080	if (buffer_uptodate(bh))
				2081	continue;
				2082
				2083	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2084	int err = 0;
				2085
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2086	fully_mapped = 0;
				2087	if (iblock < lblock) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2088	err = get_block(inode, iblock, bh, 0);
				2089	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2090	SetPageError(page);
				2091	}
				2092	if (!buffer_mapped(bh)) {
				2093	void *kaddr = kmap_atomic(page, KM_USER0);
				2094	memset(kaddr + i * blocksize, 0, blocksize);
				2095	flush_dcache_page(page);
				2096	kunmap_atomic(kaddr, KM_USER0);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2097	if (!err)
				2098	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2099	continue;
				2100	}
				2101	/*
				2102	* get_block() might have updated the buffer
				2103	* synchronously
				2104	*/
				2105	if (buffer_uptodate(bh))
				2106	continue;
				2107	}
				2108	arr[nr++] = bh;
				2109	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2110
				2111	if (fully_mapped)
				2112	SetPageMappedToDisk(page);
				2113
				2114	if (!nr) {
				2115	/*
				2116	* All buffers are uptodate - we can set the page uptodate
				2117	* as well. But not if get_block() returned an error.
				2118	*/
				2119	if (!PageError(page))
				2120	SetPageUptodate(page);
				2121	unlock_page(page);
				2122	return 0;
				2123	}
				2124
				2125	/* Stage two: lock the buffers */
				2126	for (i = 0; i < nr; i++) {
				2127	bh = arr[i];
				2128	lock_buffer(bh);
				2129	mark_buffer_async_read(bh);
				2130	}
				2131
				2132	/*
				2133	* Stage 3: start the IO. Check for uptodateness
				2134	* inside the buffer lock in case another process reading
				2135	* the underlying blockdev brought it uptodate (the sct fix).
				2136	*/
				2137	for (i = 0; i < nr; i++) {
				2138	bh = arr[i];
				2139	if (buffer_uptodate(bh))
				2140	end_buffer_async_read(bh, 1);
				2141	else
				2142	submit_bh(READ, bh);
				2143	}
				2144	return 0;
				2145	}
				2146
				2147	/* utility function for filesystems that need to do work on expanding
				2148	* truncates. Uses prepare/commit_write to allow the filesystem to
				2149	* deal with the hole.
				2150	*/
				2151	int generic_cont_expand(struct inode *inode, loff_t size)
				2152	{
				2153	struct address_space *mapping = inode->i_mapping;
				2154	struct page *page;
				2155	unsigned long index, offset, limit;
				2156	int err;
				2157
				2158	err = -EFBIG;
				2159	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2160	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2161	send_sig(SIGXFSZ, current, 0);
				2162	goto out;
				2163	}
				2164	if (size > inode->i_sb->s_maxbytes)
				2165	goto out;
				2166
				2167	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
				2168
				2169	/* ugh. in prepare/commit_write, if from==to==start of block, we
				2170	** skip the prepare. make sure we never send an offset for the start
				2171	** of a block
				2172	*/
				2173	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				2174	offset++;
				2175	}
				2176	index = size >> PAGE_CACHE_SHIFT;
				2177	err = -ENOMEM;
				2178	page = grab_cache_page(mapping, index);
				2179	if (!page)
				2180	goto out;
				2181	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
				2182	if (!err) {
				2183	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
				2184	}
				2185	unlock_page(page);
				2186	page_cache_release(page);
				2187	if (err > 0)
				2188	err = 0;
				2189	out:
				2190	return err;
				2191	}
				2192
				2193	/*
				2194	* For moronic filesystems that do not allow holes in file.
				2195	* We may have to extend the file.
				2196	*/
				2197
				2198	int cont_prepare_write(struct page *page, unsigned offset,
				2199	unsigned to, get_block_t get_block, loff_t bytes)
				2200	{
				2201	struct address_space *mapping = page->mapping;
				2202	struct inode *inode = mapping->host;
				2203	struct page *new_page;
				2204	pgoff_t pgpos;
				2205	long status;
				2206	unsigned zerofrom;
				2207	unsigned blocksize = 1 << inode->i_blkbits;
				2208	void *kaddr;
				2209
				2210	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
				2211	status = -ENOMEM;
				2212	new_page = grab_cache_page(mapping, pgpos);
				2213	if (!new_page)
				2214	goto out;
				2215	/* we might sleep */
				2216	if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
				2217	unlock_page(new_page);
				2218	page_cache_release(new_page);
				2219	continue;
				2220	}
				2221	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2222	if (zerofrom & (blocksize-1)) {
				2223	*bytes \|= (blocksize-1);
				2224	(*bytes)++;
				2225	}
				2226	status = __block_prepare_write(inode, new_page, zerofrom,
				2227	PAGE_CACHE_SIZE, get_block);
				2228	if (status)
				2229	goto out_unmap;
				2230	kaddr = kmap_atomic(new_page, KM_USER0);
				2231	memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
				2232	flush_dcache_page(new_page);
				2233	kunmap_atomic(kaddr, KM_USER0);
				2234	generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
				2235	unlock_page(new_page);
				2236	page_cache_release(new_page);
				2237	}
				2238
				2239	if (page->index < pgpos) {
				2240	/* completely inside the area */
				2241	zerofrom = offset;
				2242	} else {
				2243	/* page covers the boundary, find the boundary offset */
				2244	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2245
				2246	/* if we will expand the thing last block will be filled */
				2247	if (to > zerofrom && (zerofrom & (blocksize-1))) {
				2248	*bytes \|= (blocksize-1);
				2249	(*bytes)++;
				2250	}
				2251
				2252	/* starting below the boundary? Nothing to zero out */
				2253	if (offset <= zerofrom)
				2254	zerofrom = offset;
				2255	}
				2256	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
				2257	if (status)
				2258	goto out1;
				2259	if (zerofrom < offset) {
				2260	kaddr = kmap_atomic(page, KM_USER0);
				2261	memset(kaddr+zerofrom, 0, offset-zerofrom);
				2262	flush_dcache_page(page);
				2263	kunmap_atomic(kaddr, KM_USER0);
				2264	__block_commit_write(inode, page, zerofrom, offset);
				2265	}
				2266	return 0;
				2267	out1:
				2268	ClearPageUptodate(page);
				2269	return status;
				2270
				2271	out_unmap:
				2272	ClearPageUptodate(new_page);
				2273	unlock_page(new_page);
				2274	page_cache_release(new_page);
				2275	out:
				2276	return status;
				2277	}
				2278
				2279	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2280	get_block_t *get_block)
				2281	{
				2282	struct inode *inode = page->mapping->host;
				2283	int err = __block_prepare_write(inode, page, from, to, get_block);
				2284	if (err)
				2285	ClearPageUptodate(page);
				2286	return err;
				2287	}
				2288
				2289	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2290	{
				2291	struct inode *inode = page->mapping->host;
				2292	__block_commit_write(inode,page,from,to);
				2293	return 0;
				2294	}
				2295
				2296	int generic_commit_write(struct file file, struct page page,
				2297	unsigned from, unsigned to)
				2298	{
				2299	struct inode *inode = page->mapping->host;
				2300	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2301	__block_commit_write(inode,page,from,to);
				2302	/*
				2303	* No need to use i_size_read() here, the i_size
				2304	* cannot change under us because we hold i_sem.
				2305	*/
				2306	if (pos > inode->i_size) {
				2307	i_size_write(inode, pos);
				2308	mark_inode_dirty(inode);
				2309	}
				2310	return 0;
				2311	}
				2312
				2313
				2314	/*
				2315	* nobh_prepare_write()'s prereads are special: the buffer_heads are freed
				2316	* immediately, while under the page lock. So it needs a special end_io
				2317	* handler which does not touch the bh after unlocking it.
				2318	*
				2319	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				2320	* a race there is benign: unlock_buffer() only use the bh's address for
				2321	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				2322	* itself.
				2323	*/
				2324	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2325	{
				2326	if (uptodate) {
				2327	set_buffer_uptodate(bh);
				2328	} else {
				2329	/* This happens, due to failed READA attempts. */
				2330	clear_buffer_uptodate(bh);
				2331	}
				2332	unlock_buffer(bh);
				2333	}
				2334
				2335	/*
				2336	* On entry, the page is fully not uptodate.
				2337	* On exit the page is fully uptodate in the areas outside (from,to)
				2338	*/
				2339	int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
				2340	get_block_t *get_block)
				2341	{
				2342	struct inode *inode = page->mapping->host;
				2343	const unsigned blkbits = inode->i_blkbits;
				2344	const unsigned blocksize = 1 << blkbits;
				2345	struct buffer_head map_bh;
				2346	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
				2347	unsigned block_in_page;
				2348	unsigned block_start;
				2349	sector_t block_in_file;
				2350	char *kaddr;
				2351	int nr_reads = 0;
				2352	int i;
				2353	int ret = 0;
				2354	int is_mapped_to_disk = 1;
				2355	int dirtied_it = 0;
				2356
				2357	if (PageMappedToDisk(page))
				2358	return 0;
				2359
				2360	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
				2361	map_bh.b_page = page;
				2362
				2363	/*
				2364	* We loop across all blocks in the page, whether or not they are
				2365	* part of the affected region. This is so we can discover if the
				2366	* page is fully mapped-to-disk.
				2367	*/
				2368	for (block_start = 0, block_in_page = 0;
				2369	block_start < PAGE_CACHE_SIZE;
				2370	block_in_page++, block_start += blocksize) {
				2371	unsigned block_end = block_start + blocksize;
				2372	int create;
				2373
				2374	map_bh.b_state = 0;
				2375	create = 1;
				2376	if (block_start >= to)
				2377	create = 0;
				2378	ret = get_block(inode, block_in_file + block_in_page,
				2379	&map_bh, create);
				2380	if (ret)
				2381	goto failed;
				2382	if (!buffer_mapped(&map_bh))
				2383	is_mapped_to_disk = 0;
				2384	if (buffer_new(&map_bh))
				2385	unmap_underlying_metadata(map_bh.b_bdev,
				2386	map_bh.b_blocknr);
				2387	if (PageUptodate(page))
				2388	continue;
				2389	if (buffer_new(&map_bh) \|\| !buffer_mapped(&map_bh)) {
				2390	kaddr = kmap_atomic(page, KM_USER0);
				2391	if (block_start < from) {
				2392	memset(kaddr+block_start, 0, from-block_start);
				2393	dirtied_it = 1;
				2394	}
				2395	if (block_end > to) {
				2396	memset(kaddr + to, 0, block_end - to);
				2397	dirtied_it = 1;
				2398	}
				2399	flush_dcache_page(page);
				2400	kunmap_atomic(kaddr, KM_USER0);
				2401	continue;
				2402	}
				2403	if (buffer_uptodate(&map_bh))
				2404	continue; /* reiserfs does this */
				2405	if (block_start < from \|\| block_end > to) {
				2406	struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
				2407
				2408	if (!bh) {
				2409	ret = -ENOMEM;
				2410	goto failed;
				2411	}
				2412	bh->b_state = map_bh.b_state;
				2413	atomic_set(&bh->b_count, 0);
				2414	bh->b_this_page = NULL;
				2415	bh->b_page = page;
				2416	bh->b_blocknr = map_bh.b_blocknr;
				2417	bh->b_size = blocksize;
				2418	bh->b_data = (char *)(long)block_start;
				2419	bh->b_bdev = map_bh.b_bdev;
				2420	bh->b_private = NULL;
				2421	read_bh[nr_reads++] = bh;
				2422	}
				2423	}
				2424
				2425	if (nr_reads) {
				2426	struct buffer_head *bh;
				2427
				2428	/*
				2429	* The page is locked, so these buffers are protected from
				2430	* any VM or truncate activity. Hence we don't need to care
				2431	* for the buffer_head refcounts.
				2432	*/
				2433	for (i = 0; i < nr_reads; i++) {
				2434	bh = read_bh[i];
				2435	lock_buffer(bh);
				2436	bh->b_end_io = end_buffer_read_nobh;
				2437	submit_bh(READ, bh);
				2438	}
				2439	for (i = 0; i < nr_reads; i++) {
				2440	bh = read_bh[i];
				2441	wait_on_buffer(bh);
				2442	if (!buffer_uptodate(bh))
				2443	ret = -EIO;
				2444	free_buffer_head(bh);
				2445	read_bh[i] = NULL;
				2446	}
				2447	if (ret)
				2448	goto failed;
				2449	}
				2450
				2451	if (is_mapped_to_disk)
				2452	SetPageMappedToDisk(page);
				2453	SetPageUptodate(page);
				2454
				2455	/*
				2456	* Setting the page dirty here isn't necessary for the prepare_write
				2457	* function - commit_write will do that. But if/when this function is
				2458	* used within the pagefault handler to ensure that all mmapped pages
				2459	* have backing space in the filesystem, we will need to dirty the page
				2460	* if its contents were altered.
				2461	*/
				2462	if (dirtied_it)
				2463	set_page_dirty(page);
				2464
				2465	return 0;
				2466
				2467	failed:
				2468	for (i = 0; i < nr_reads; i++) {
				2469	if (read_bh[i])
				2470	free_buffer_head(read_bh[i]);
				2471	}
				2472
				2473	/*
				2474	* Error recovery is pretty slack. Clear the page and mark it dirty
				2475	* so we'll later zero out any blocks which _were_ allocated.
				2476	*/
				2477	kaddr = kmap_atomic(page, KM_USER0);
				2478	memset(kaddr, 0, PAGE_CACHE_SIZE);
				2479	kunmap_atomic(kaddr, KM_USER0);
				2480	SetPageUptodate(page);
				2481	set_page_dirty(page);
				2482	return ret;
				2483	}
				2484	EXPORT_SYMBOL(nobh_prepare_write);
				2485
				2486	int nobh_commit_write(struct file file, struct page page,
				2487	unsigned from, unsigned to)
				2488	{
				2489	struct inode *inode = page->mapping->host;
				2490	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2491
				2492	set_page_dirty(page);
				2493	if (pos > inode->i_size) {
				2494	i_size_write(inode, pos);
				2495	mark_inode_dirty(inode);
				2496	}
				2497	return 0;
				2498	}
				2499	EXPORT_SYMBOL(nobh_commit_write);
				2500
				2501	/*
				2502	* nobh_writepage() - based on block_full_write_page() except
				2503	* that it tries to operate without attaching bufferheads to
				2504	* the page.
				2505	*/
				2506	int nobh_writepage(struct page page, get_block_t get_block,
				2507	struct writeback_control *wbc)
				2508	{
				2509	struct inode * const inode = page->mapping->host;
				2510	loff_t i_size = i_size_read(inode);
				2511	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2512	unsigned offset;
				2513	void *kaddr;
				2514	int ret;
				2515
				2516	/* Is the page fully inside i_size? */
				2517	if (page->index < end_index)
				2518	goto out;
				2519
				2520	/* Is the page fully outside i_size? (truncate in progress) */
				2521	offset = i_size & (PAGE_CACHE_SIZE-1);
				2522	if (page->index >= end_index+1 \|\| !offset) {
				2523	/*
				2524	* The page may have dirty, unmapped buffers. For example,
				2525	* they may have been added in ext3_writepage(). Make them
				2526	* freeable here, so the page does not leak.
				2527	*/
				2528	#if 0
				2529	/* Not really sure about this - do we need this ? */
				2530	if (page->mapping->a_ops->invalidatepage)
				2531	page->mapping->a_ops->invalidatepage(page, offset);
				2532	#endif
				2533	unlock_page(page);
				2534	return 0; /* don't care */
				2535	}
				2536
				2537	/*
				2538	* The page straddles i_size. It must be zeroed out on each and every
				2539	* writepage invocation because it may be mmapped. "A file is mapped
				2540	* in multiples of the page size. For a file that is not a multiple of
				2541	* the page size, the remaining memory is zeroed when mapped, and
				2542	* writes to that region are not written out to the file."
				2543	*/
				2544	kaddr = kmap_atomic(page, KM_USER0);
				2545	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2546	flush_dcache_page(page);
				2547	kunmap_atomic(kaddr, KM_USER0);
				2548	out:
				2549	ret = mpage_writepage(page, get_block, wbc);
				2550	if (ret == -EAGAIN)
				2551	ret = __block_write_full_page(inode, page, get_block, wbc);
				2552	return ret;
				2553	}
				2554	EXPORT_SYMBOL(nobh_writepage);
				2555
				2556	/*
				2557	* This function assumes that ->prepare_write() uses nobh_prepare_write().
				2558	*/
				2559	int nobh_truncate_page(struct address_space *mapping, loff_t from)
				2560	{
				2561	struct inode *inode = mapping->host;
				2562	unsigned blocksize = 1 << inode->i_blkbits;
				2563	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2564	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2565	unsigned to;
				2566	struct page *page;
				2567	struct address_space_operations *a_ops = mapping->a_ops;
				2568	char *kaddr;
				2569	int ret = 0;
				2570
				2571	if ((offset & (blocksize - 1)) == 0)
				2572	goto out;
				2573
				2574	ret = -ENOMEM;
				2575	page = grab_cache_page(mapping, index);
				2576	if (!page)
				2577	goto out;
				2578
				2579	to = (offset + blocksize) & ~(blocksize - 1);
				2580	ret = a_ops->prepare_write(NULL, page, offset, to);
				2581	if (ret == 0) {
				2582	kaddr = kmap_atomic(page, KM_USER0);
				2583	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2584	flush_dcache_page(page);
				2585	kunmap_atomic(kaddr, KM_USER0);
				2586	set_page_dirty(page);
				2587	}
				2588	unlock_page(page);
				2589	page_cache_release(page);
				2590	out:
				2591	return ret;
				2592	}
				2593	EXPORT_SYMBOL(nobh_truncate_page);
				2594
				2595	int block_truncate_page(struct address_space *mapping,
				2596	loff_t from, get_block_t *get_block)
				2597	{
				2598	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2599	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2600	unsigned blocksize;
				2601	pgoff_t iblock;
				2602	unsigned length, pos;
				2603	struct inode *inode = mapping->host;
				2604	struct page *page;
				2605	struct buffer_head *bh;
				2606	void *kaddr;
				2607	int err;
				2608
				2609	blocksize = 1 << inode->i_blkbits;
				2610	length = offset & (blocksize - 1);
				2611
				2612	/* Block boundary? Nothing to do */
				2613	if (!length)
				2614	return 0;
				2615
				2616	length = blocksize - length;
				2617	iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2618
				2619	page = grab_cache_page(mapping, index);
				2620	err = -ENOMEM;
				2621	if (!page)
				2622	goto out;
				2623
				2624	if (!page_has_buffers(page))
				2625	create_empty_buffers(page, blocksize, 0);
				2626
				2627	/* Find the buffer that contains "offset" */
				2628	bh = page_buffers(page);
				2629	pos = blocksize;
				2630	while (offset >= pos) {
				2631	bh = bh->b_this_page;
				2632	iblock++;
				2633	pos += blocksize;
				2634	}
				2635
				2636	err = 0;
				2637	if (!buffer_mapped(bh)) {
				2638	err = get_block(inode, iblock, bh, 0);
				2639	if (err)
				2640	goto unlock;
				2641	/* unmapped? It's a hole - nothing to do */
				2642	if (!buffer_mapped(bh))
				2643	goto unlock;
				2644	}
				2645
				2646	/* Ok, it's mapped. Make sure it's up-to-date */
				2647	if (PageUptodate(page))
				2648	set_buffer_uptodate(bh);
				2649
				2650	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
				2651	err = -EIO;
				2652	ll_rw_block(READ, 1, &bh);
				2653	wait_on_buffer(bh);
				2654	/* Uhhuh. Read error. Complain and punt. */
				2655	if (!buffer_uptodate(bh))
				2656	goto unlock;
				2657	}
				2658
				2659	kaddr = kmap_atomic(page, KM_USER0);
				2660	memset(kaddr + offset, 0, length);
				2661	flush_dcache_page(page);
				2662	kunmap_atomic(kaddr, KM_USER0);
				2663
				2664	mark_buffer_dirty(bh);
				2665	err = 0;
				2666
				2667	unlock:
				2668	unlock_page(page);
				2669	page_cache_release(page);
				2670	out:
				2671	return err;
				2672	}
				2673
				2674	/*
				2675	* The generic ->writepage function for buffer-backed address_spaces
				2676	*/
				2677	int block_write_full_page(struct page page, get_block_t get_block,
				2678	struct writeback_control *wbc)
				2679	{
				2680	struct inode * const inode = page->mapping->host;
				2681	loff_t i_size = i_size_read(inode);
				2682	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2683	unsigned offset;
				2684	void *kaddr;
				2685
				2686	/* Is the page fully inside i_size? */
				2687	if (page->index < end_index)
				2688	return __block_write_full_page(inode, page, get_block, wbc);
				2689
				2690	/* Is the page fully outside i_size? (truncate in progress) */
				2691	offset = i_size & (PAGE_CACHE_SIZE-1);
				2692	if (page->index >= end_index+1 \|\| !offset) {
				2693	/*
				2694	* The page may have dirty, unmapped buffers. For example,
				2695	* they may have been added in ext3_writepage(). Make them
				2696	* freeable here, so the page does not leak.
				2697	*/
				2698	block_invalidatepage(page, 0);
				2699	unlock_page(page);
				2700	return 0; /* don't care */
				2701	}
				2702
				2703	/*
				2704	* The page straddles i_size. It must be zeroed out on each and every
				2705	* writepage invokation because it may be mmapped. "A file is mapped
				2706	* in multiples of the page size. For a file that is not a multiple of
				2707	* the page size, the remaining memory is zeroed when mapped, and
				2708	* writes to that region are not written out to the file."
				2709	*/
				2710	kaddr = kmap_atomic(page, KM_USER0);
				2711	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2712	flush_dcache_page(page);
				2713	kunmap_atomic(kaddr, KM_USER0);
				2714	return __block_write_full_page(inode, page, get_block, wbc);
				2715	}
				2716
				2717	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2718	get_block_t *get_block)
				2719	{
				2720	struct buffer_head tmp;
				2721	struct inode *inode = mapping->host;
				2722	tmp.b_state = 0;
				2723	tmp.b_blocknr = 0;
				2724	get_block(inode, block, &tmp, 0);
				2725	return tmp.b_blocknr;
				2726	}
				2727
				2728	static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
				2729	{
				2730	struct buffer_head *bh = bio->bi_private;
				2731
				2732	if (bio->bi_size)
				2733	return 1;
				2734
				2735	if (err == -EOPNOTSUPP) {
				2736	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2737	set_bit(BH_Eopnotsupp, &bh->b_state);
				2738	}
				2739
				2740	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2741	bio_put(bio);
				2742	return 0;
				2743	}
				2744
				2745	int submit_bh(int rw, struct buffer_head * bh)
				2746	{
				2747	struct bio *bio;
				2748	int ret = 0;
				2749
				2750	BUG_ON(!buffer_locked(bh));
				2751	BUG_ON(!buffer_mapped(bh));
				2752	BUG_ON(!bh->b_end_io);
				2753
				2754	if (buffer_ordered(bh) && (rw == WRITE))
				2755	rw = WRITE_BARRIER;
				2756
				2757	/*
				2758	* Only clear out a write error when rewriting, should this
				2759	* include WRITE_SYNC as well?
				2760	*/
				2761	if (test_set_buffer_req(bh) && (rw == WRITE \|\| rw == WRITE_BARRIER))
				2762	clear_buffer_write_io_error(bh);
				2763
				2764	/*
				2765	* from here on down, it's all bio -- do the initial mapping,
				2766	* submit_bio -> generic_make_request may further map this bio around
				2767	*/
				2768	bio = bio_alloc(GFP_NOIO, 1);
				2769
				2770	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2771	bio->bi_bdev = bh->b_bdev;
				2772	bio->bi_io_vec[0].bv_page = bh->b_page;
				2773	bio->bi_io_vec[0].bv_len = bh->b_size;
				2774	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2775
				2776	bio->bi_vcnt = 1;
				2777	bio->bi_idx = 0;
				2778	bio->bi_size = bh->b_size;
				2779
				2780	bio->bi_end_io = end_bio_bh_io_sync;
				2781	bio->bi_private = bh;
				2782
				2783	bio_get(bio);
				2784	submit_bio(rw, bio);
				2785
				2786	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2787	ret = -EOPNOTSUPP;
				2788
				2789	bio_put(bio);
				2790	return ret;
				2791	}
				2792
				2793	/**
				2794	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2795	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2796	* @nr: number of &struct buffer_heads in the array
				2797	* @bhs: array of pointers to &struct buffer_head
				2798	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2799	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2800	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2801	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2802	* are sent to disk. The fourth %READA option is described in the documentation
				2803	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2804	*
				2805	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2806	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				2807	* clean when doing a write request, and any buffer that appears to be
				2808	* up-to-date when doing read request. Further it marks as clean buffers that
				2809	* are processed for writing (the buffer cache won't assume that they are
				2810	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2811	*
				2812	* ll_rw_block sets b_end_io to simple completion handler that marks
				2813	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				2814	* any waiters.
				2815	*
				2816	* All of the buffers must be for the same device, and must also be a
				2817	* multiple of the current approved size for the device.
				2818	*/
				2819	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				2820	{
				2821	int i;
				2822
				2823	for (i = 0; i < nr; i++) {
				2824	struct buffer_head *bh = bhs[i];
				2825
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2826	if (rw == SWRITE)
				2827	lock_buffer(bh);
				2828	else if (test_set_buffer_locked(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2829	continue;
				2830
				2831	get_bh(bh);
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2832	if (rw == WRITE \|\| rw == SWRITE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2833	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2834	bh->b_end_io = end_buffer_write_sync;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2835	submit_bh(WRITE, bh);
				2836	continue;
				2837	}
				2838	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2839	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2840	bh->b_end_io = end_buffer_read_sync;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2841	submit_bh(rw, bh);
				2842	continue;
				2843	}
				2844	}
				2845	unlock_buffer(bh);
				2846	put_bh(bh);
				2847	}
				2848	}
				2849
				2850	/*
				2851	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2852	* and then start new I/O and then wait upon it. The caller must have a ref on
				2853	* the buffer_head.
				2854	*/
				2855	int sync_dirty_buffer(struct buffer_head *bh)
				2856	{
				2857	int ret = 0;
				2858
				2859	WARN_ON(atomic_read(&bh->b_count) < 1);
				2860	lock_buffer(bh);
				2861	if (test_clear_buffer_dirty(bh)) {
				2862	get_bh(bh);
				2863	bh->b_end_io = end_buffer_write_sync;
				2864	ret = submit_bh(WRITE, bh);
				2865	wait_on_buffer(bh);
				2866	if (buffer_eopnotsupp(bh)) {
				2867	clear_buffer_eopnotsupp(bh);
				2868	ret = -EOPNOTSUPP;
				2869	}
				2870	if (!ret && !buffer_uptodate(bh))
				2871	ret = -EIO;
				2872	} else {
				2873	unlock_buffer(bh);
				2874	}
				2875	return ret;
				2876	}
				2877
				2878	/*
				2879	* try_to_free_buffers() checks if all the buffers on this particular page
				2880	* are unused, and releases them if so.
				2881	*
				2882	* Exclusion against try_to_free_buffers may be obtained by either
				2883	* locking the page or by holding its mapping's private_lock.
				2884	*
				2885	* If the page is dirty but all the buffers are clean then we need to
				2886	* be sure to mark the page clean as well. This is because the page
				2887	* may be against a block device, and a later reattachment of buffers
				2888	* to a dirty page will set all buffers dirty. Which would corrupt
				2889	* filesystem data on the same device.
				2890	*
				2891	* The same applies to regular filesystem pages: if all the buffers are
				2892	* clean then we set the page clean and proceed. To do that, we require
				2893	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				2894	* private_lock.
				2895	*
				2896	* try_to_free_buffers() is non-blocking.
				2897	*/
				2898	static inline int buffer_busy(struct buffer_head *bh)
				2899	{
				2900	return atomic_read(&bh->b_count) \|
				2901	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				2902	}
				2903
				2904	static int
				2905	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				2906	{
				2907	struct buffer_head *head = page_buffers(page);
				2908	struct buffer_head *bh;
				2909
				2910	bh = head;
				2911	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	2912	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2913	set_bit(AS_EIO, &page->mapping->flags);
				2914	if (buffer_busy(bh))
				2915	goto failed;
				2916	bh = bh->b_this_page;
				2917	} while (bh != head);
				2918
				2919	do {
				2920	struct buffer_head *next = bh->b_this_page;
				2921
				2922	if (!list_empty(&bh->b_assoc_buffers))
				2923	__remove_assoc_queue(bh);
				2924	bh = next;
				2925	} while (bh != head);
				2926	*buffers_to_free = head;
				2927	__clear_page_buffers(page);
				2928	return 1;
				2929	failed:
				2930	return 0;
				2931	}
				2932
				2933	int try_to_free_buffers(struct page *page)
				2934	{
				2935	struct address_space * const mapping = page->mapping;
				2936	struct buffer_head *buffers_to_free = NULL;
				2937	int ret = 0;
				2938
				2939	BUG_ON(!PageLocked(page));
				2940	if (PageWriteback(page))
				2941	return 0;
				2942
				2943	if (mapping == NULL) { /* can this still happen? */
				2944	ret = drop_buffers(page, &buffers_to_free);
				2945	goto out;
				2946	}
				2947
				2948	spin_lock(&mapping->private_lock);
				2949	ret = drop_buffers(page, &buffers_to_free);
				2950	if (ret) {
				2951	/*
				2952	* If the filesystem writes its buffers by hand (eg ext3)
				2953	* then we can have clean buffers against a dirty page. We
				2954	* clean the page here; otherwise later reattachment of buffers
				2955	* could encounter a non-uptodate page, which is unresolvable.
				2956	* This only applies in the rare case where try_to_free_buffers
				2957	* succeeds but the page is not freed.
				2958	*/
				2959	clear_page_dirty(page);
				2960	}
				2961	spin_unlock(&mapping->private_lock);
				2962	out:
				2963	if (buffers_to_free) {
				2964	struct buffer_head *bh = buffers_to_free;
				2965
				2966	do {
				2967	struct buffer_head *next = bh->b_this_page;
				2968	free_buffer_head(bh);
				2969	bh = next;
				2970	} while (bh != buffers_to_free);
				2971	}
				2972	return ret;
				2973	}
				2974	EXPORT_SYMBOL(try_to_free_buffers);
				2975
				2976	int block_sync_page(struct page *page)
				2977	{
				2978	struct address_space *mapping;
				2979
				2980	smp_mb();
				2981	mapping = page_mapping(page);
				2982	if (mapping)
				2983	blk_run_backing_dev(mapping->backing_dev_info, page);
				2984	return 0;
				2985	}
				2986
				2987	/*
				2988	* There are no bdflush tunables left. But distributions are
				2989	* still running obsolete flush daemons, so we terminate them here.
				2990	*
				2991	* Use of bdflush() is deprecated and will be removed in a future kernel.
				2992	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				2993	*/
				2994	asmlinkage long sys_bdflush(int func, long data)
				2995	{
				2996	static int msg_count;
				2997
				2998	if (!capable(CAP_SYS_ADMIN))
				2999	return -EPERM;
				3000
				3001	if (msg_count < 5) {
				3002	msg_count++;
				3003	printk(KERN_INFO
				3004	"warning: process `%s' used the obsolete bdflush"
				3005	" system call\n", current->comm);
				3006	printk(KERN_INFO "Fix your initscripts?\n");
				3007	}
				3008
				3009	if (func == 1)
				3010	do_exit(0);
				3011	return 0;
				3012	}
				3013
				3014	/*
				3015	* Buffer-head allocation
				3016	*/
				3017	static kmem_cache_t *bh_cachep;
				3018
				3019	/*
				3020	* Once the number of bh's in the machine exceeds this level, we start
				3021	* stripping them in writeback.
				3022	*/
				3023	static int max_buffer_heads;
				3024
				3025	int buffer_heads_over_limit;
				3026
				3027	struct bh_accounting {
				3028	int nr; /* Number of live bh's */
				3029	int ratelimit; /* Limit cacheline bouncing */
				3030	};
				3031
				3032	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3033
				3034	static void recalc_bh_state(void)
				3035	{
				3036	int i;
				3037	int tot = 0;
				3038
				3039	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				3040	return;
				3041	__get_cpu_var(bh_accounting).ratelimit = 0;
				3042	for_each_cpu(i)
				3043	tot += per_cpu(bh_accounting, i).nr;
				3044	buffer_heads_over_limit = (tot > max_buffer_heads);
				3045	}
				3046
				3047	struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
				3048	{
				3049	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
				3050	if (ret) {
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3051	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3052	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3053	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3054	}
				3055	return ret;
				3056	}
				3057	EXPORT_SYMBOL(alloc_buffer_head);
				3058
				3059	void free_buffer_head(struct buffer_head *bh)
				3060	{
				3061	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3062	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3063	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3064	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3065	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3066	}
				3067	EXPORT_SYMBOL(free_buffer_head);
				3068
				3069	static void
				3070	init_buffer_head(void data, kmem_cache_t cachep, unsigned long flags)
				3071	{
				3072	if ((flags & (SLAB_CTOR_VERIFY\|SLAB_CTOR_CONSTRUCTOR)) ==
				3073	SLAB_CTOR_CONSTRUCTOR) {
				3074	struct buffer_head * bh = (struct buffer_head *)data;
				3075
				3076	memset(bh, 0, sizeof(*bh));
				3077	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				3078	}
				3079	}
				3080
				3081	#ifdef CONFIG_HOTPLUG_CPU
				3082	static void buffer_exit_cpu(int cpu)
				3083	{
				3084	int i;
				3085	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3086
				3087	for (i = 0; i < BH_LRU_SIZE; i++) {
				3088	brelse(b->bhs[i]);
				3089	b->bhs[i] = NULL;
				3090	}
				3091	}
				3092
				3093	static int buffer_cpu_notify(struct notifier_block *self,
				3094	unsigned long action, void *hcpu)
				3095	{
				3096	if (action == CPU_DEAD)
				3097	buffer_exit_cpu((unsigned long)hcpu);
				3098	return NOTIFY_OK;
				3099	}
				3100	#endif /* CONFIG_HOTPLUG_CPU */
				3101
				3102	void __init buffer_init(void)
				3103	{
				3104	int nrpages;
				3105
				3106	bh_cachep = kmem_cache_create("buffer_head",
				3107	sizeof(struct buffer_head), 0,
Andrea Arcangeli	e422fd2	2005-05-05 16:15:04 -0700	[diff] [blame]	3108	SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC, init_buffer_head, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3109
				3110	/*
				3111	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3112	*/
				3113	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3114	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3115	hotcpu_notifier(buffer_cpu_notify, 0);
				3116	}
				3117
				3118	EXPORT_SYMBOL(__bforget);
				3119	EXPORT_SYMBOL(__brelse);
				3120	EXPORT_SYMBOL(__wait_on_buffer);
				3121	EXPORT_SYMBOL(block_commit_write);
				3122	EXPORT_SYMBOL(block_prepare_write);
				3123	EXPORT_SYMBOL(block_read_full_page);
				3124	EXPORT_SYMBOL(block_sync_page);
				3125	EXPORT_SYMBOL(block_truncate_page);
				3126	EXPORT_SYMBOL(block_write_full_page);
				3127	EXPORT_SYMBOL(cont_prepare_write);
				3128	EXPORT_SYMBOL(end_buffer_async_write);
				3129	EXPORT_SYMBOL(end_buffer_read_sync);
				3130	EXPORT_SYMBOL(end_buffer_write_sync);
				3131	EXPORT_SYMBOL(file_fsync);
				3132	EXPORT_SYMBOL(fsync_bdev);
				3133	EXPORT_SYMBOL(generic_block_bmap);
				3134	EXPORT_SYMBOL(generic_commit_write);
				3135	EXPORT_SYMBOL(generic_cont_expand);
				3136	EXPORT_SYMBOL(init_buffer);
				3137	EXPORT_SYMBOL(invalidate_bdev);
				3138	EXPORT_SYMBOL(ll_rw_block);
				3139	EXPORT_SYMBOL(mark_buffer_dirty);
				3140	EXPORT_SYMBOL(submit_bh);
				3141	EXPORT_SYMBOL(sync_dirty_buffer);
				3142	EXPORT_SYMBOL(unlock_buffer);