Blame - fs/buffer.c - kernel/msm-4.19

blob: 90a98865b0ccc38bc74d62a46d7243e683d9fe50 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required on older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	27	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	28	#include <linux/blkdev.h>
				29	#include <linux/file.h>
				30	#include <linux/quotaops.h>
				31	#include <linux/highmem.h>
				32	#include <linux/module.h>
				33	#include <linux/writeback.h>
				34	#include <linux/hash.h>
				35	#include <linux/suspend.h>
				36	#include <linux/buffer_head.h>
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	37	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	38	#include <linux/bio.h>
				39	#include <linux/notifier.h>
				40	#include <linux/cpu.h>
				41	#include <linux/bitops.h>
				42	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	43	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	44
				45	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	46
				47	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				48
				49	inline void
				50	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				51	{
				52	bh->b_end_io = handler;
				53	bh->b_private = private;
				54	}
				55
				56	static int sync_buffer(void *word)
				57	{
				58	struct block_device *bd;
				59	struct buffer_head *bh
				60	= container_of(word, struct buffer_head, b_state);
				61
				62	smp_mb();
				63	bd = bh->b_bdev;
				64	if (bd)
				65	blk_run_address_space(bd->bd_inode->i_mapping);
				66	io_schedule();
				67	return 0;
				68	}
				69
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	70	void __lock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	71	{
				72	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				73	TASK_UNINTERRUPTIBLE);
				74	}
				75	EXPORT_SYMBOL(__lock_buffer);
				76
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	77	void unlock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	78	{
Nick Piggin	51b07fc	2008-10-18 20:27:00 -0700	[diff] [blame]	79	clear_bit_unlock(BH_Lock, &bh->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	80	smp_mb__after_clear_bit();
				81	wake_up_bit(&bh->b_state, BH_Lock);
				82	}
				83
				84	/*
				85	* Block until a buffer comes unlocked. This doesn't stop it
				86	* from becoming locked again - you have to lock it yourself
				87	* if you want to preserve its state.
				88	*/
				89	void __wait_on_buffer(struct buffer_head * bh)
				90	{
				91	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				92	}
				93
				94	static void
				95	__clear_page_buffers(struct page *page)
				96	{
				97	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	98	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	99	page_cache_release(page);
				100	}
				101
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	102
				103	static int quiet_error(struct buffer_head *bh)
				104	{
				105	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
				106	return 0;
				107	return 1;
				108	}
				109
				110
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	111	static void buffer_io_error(struct buffer_head *bh)
				112	{
				113	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	114	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				115	bdevname(bh->b_bdev, b),
				116	(unsigned long long)bh->b_blocknr);
				117	}
				118
				119	/*
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	120	* End-of-IO handler helper function which does not touch the bh after
				121	* unlocking it.
				122	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				123	* a race there is benign: unlock_buffer() only use the bh's address for
				124	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				125	* itself.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	126	*/
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	127	static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	128	{
				129	if (uptodate) {
				130	set_buffer_uptodate(bh);
				131	} else {
				132	/* This happens, due to failed READA attempts. */
				133	clear_buffer_uptodate(bh);
				134	}
				135	unlock_buffer(bh);
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	136	}
				137
				138	/*
				139	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				140	* unlock the buffer. This is what ll_rw_block uses too.
				141	*/
				142	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				143	{
				144	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	145	put_bh(bh);
				146	}
				147
				148	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				149	{
				150	char b[BDEVNAME_SIZE];
				151
				152	if (uptodate) {
				153	set_buffer_uptodate(bh);
				154	} else {
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	155	if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	156	buffer_io_error(bh);
				157	printk(KERN_WARNING "lost page write due to "
				158	"I/O error on %s\n",
				159	bdevname(bh->b_bdev, b));
				160	}
				161	set_buffer_write_io_error(bh);
				162	clear_buffer_uptodate(bh);
				163	}
				164	unlock_buffer(bh);
				165	put_bh(bh);
				166	}
				167
				168	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	169	* Various filesystems appear to want __find_get_block to be non-blocking.
				170	* But it's the page lock which protects the buffers. To get around this,
				171	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				172	* private_lock.
				173	*
				174	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				175	* may be quite high. This code could TryLock the page, and if that
				176	* succeeds, there is no need to take private_lock. (But if
				177	* private_lock is contended then so is mapping->tree_lock).
				178	*/
				179	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	180	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	181	{
				182	struct inode *bd_inode = bdev->bd_inode;
				183	struct address_space *bd_mapping = bd_inode->i_mapping;
				184	struct buffer_head *ret = NULL;
				185	pgoff_t index;
				186	struct buffer_head *bh;
				187	struct buffer_head *head;
				188	struct page *page;
				189	int all_mapped = 1;
				190
				191	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				192	page = find_get_page(bd_mapping, index);
				193	if (!page)
				194	goto out;
				195
				196	spin_lock(&bd_mapping->private_lock);
				197	if (!page_has_buffers(page))
				198	goto out_unlock;
				199	head = page_buffers(page);
				200	bh = head;
				201	do {
Nikanth Karthikesan	97f76d3	2009-04-02 16:56:46 -0700	[diff] [blame]	202	if (!buffer_mapped(bh))
				203	all_mapped = 0;
				204	else if (bh->b_blocknr == block) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	205	ret = bh;
				206	get_bh(bh);
				207	goto out_unlock;
				208	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	209	bh = bh->b_this_page;
				210	} while (bh != head);
				211
				212	/* we might be here because some of the buffers on this page are
				213	* not mapped. This is due to various races between
				214	* file io on the block device and getblk. It gets dealt with
				215	* elsewhere, don't buffer_error if we had some unmapped buffers
				216	*/
				217	if (all_mapped) {
				218	printk("__find_get_block_slow() failed. "
				219	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	220	(unsigned long long)block,
				221	(unsigned long long)bh->b_blocknr);
				222	printk("b_state=0x%08lx, b_size=%zu\n",
				223	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	224	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				225	}
				226	out_unlock:
				227	spin_unlock(&bd_mapping->private_lock);
				228	page_cache_release(page);
				229	out:
				230	return ret;
				231	}
				232
				233	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				234	of fs corruption is going on. Trashing dirty data always imply losing
				235	information that was supposed to be just stored on the physical layer
				236	by the user.
				237
				238	Thus invalidate_buffers in general usage is not allowed to trash
				239	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				240	be preserved. These buffers are simply skipped.
				241
				242	We also skip buffers which are still in use. For example this can
				243	happen if a userspace program is reading the block device.
				244
				245	NOTE: In the case where the user removed a removable-media-disk even if
				246	there's still dirty data not synced on disk (due a bug in the device driver
				247	or due an error of the user), by not destroying the dirty buffers we could
				248	generate corruption also on the next media inserted, thus a parameter is
				249	necessary to handle this case in the most safe way possible (trying
				250	to not corrupt also the new disk inserted with the data belonging to
				251	the old now corrupted disk). Also for the ramdisk the natural thing
				252	to do in order to release the ramdisk memory is to destroy dirty buffers.
				253
				254	These are two special cases. Normal usage imply the device driver
				255	to issue a sync on the device (without waiting I/O completion) and
				256	then an invalidate_buffers call that doesn't trash dirty buffers.
				257
				258	For handling cache coherency with the blkdev pagecache the 'update' case
				259	has been introduced. It is needed to re-read from disk any pinned
				260	buffer. NOTE: re-reading from disk is destructive so we can do it only
				261	when we assume nobody is changing the buffercache under our I/O and when
				262	we think the disk contains more recent information than the buffercache.
				263	The update == 1 pass marks the buffers we need to update, the update == 2
				264	pass does the actual I/O. */
Peter Zijlstra	f98393a	2007-05-06 14:49:54 -0700	[diff] [blame]	265	void invalidate_bdev(struct block_device *bdev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	266	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	267	struct address_space *mapping = bdev->bd_inode->i_mapping;
				268
				269	if (mapping->nrpages == 0)
				270	return;
				271
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	272	invalidate_bh_lrus();
Andrew Morton	fc0ecff	2007-02-10 01:45:39 -0800	[diff] [blame]	273	invalidate_mapping_pages(mapping, 0, -1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	274	}
				275
				276	/*
				277	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				278	*/
				279	static void free_more_memory(void)
				280	{
Mel Gorman	19770b3	2008-04-28 02:12:18 -0700	[diff] [blame]	281	struct zone *zone;
Mel Gorman	0e88460	2008-04-28 02:12:14 -0700	[diff] [blame]	282	int nid;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	283
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	284	wakeup_flusher_threads(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	285	yield();
				286
Mel Gorman	0e88460	2008-04-28 02:12:14 -0700	[diff] [blame]	287	for_each_online_node(nid) {
Mel Gorman	19770b3	2008-04-28 02:12:18 -0700	[diff] [blame]	288	(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
				289	gfp_zone(GFP_NOFS), NULL,
				290	&zone);
				291	if (zone)
Mel Gorman	54a6eb5	2008-04-28 02:12:16 -0700	[diff] [blame]	292	try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
KAMEZAWA Hiroyuki	327c0e9	2009-03-31 15:23:31 -0700	[diff] [blame]	293	GFP_NOFS, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	294	}
				295	}
				296
				297	/*
				298	* I/O completion handler for block_read_full_page() - pages
				299	* which come unlocked at the end of I/O.
				300	*/
				301	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				302	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	303	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	304	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	305	struct buffer_head *tmp;
				306	struct page *page;
				307	int page_uptodate = 1;
				308
				309	BUG_ON(!buffer_async_read(bh));
				310
				311	page = bh->b_page;
				312	if (uptodate) {
				313	set_buffer_uptodate(bh);
				314	} else {
				315	clear_buffer_uptodate(bh);
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	316	if (!quiet_error(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	317	buffer_io_error(bh);
				318	SetPageError(page);
				319	}
				320
				321	/*
				322	* Be _very_ careful from here on. Bad things can happen if
				323	* two buffer heads end IO at almost the same time and both
				324	* decide that the page is now completely done.
				325	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	326	first = page_buffers(page);
				327	local_irq_save(flags);
				328	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	329	clear_buffer_async_read(bh);
				330	unlock_buffer(bh);
				331	tmp = bh;
				332	do {
				333	if (!buffer_uptodate(tmp))
				334	page_uptodate = 0;
				335	if (buffer_async_read(tmp)) {
				336	BUG_ON(!buffer_locked(tmp));
				337	goto still_busy;
				338	}
				339	tmp = tmp->b_this_page;
				340	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	341	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				342	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	343
				344	/*
				345	* If none of the buffers had errors and they are all
				346	* uptodate then we can set the page uptodate.
				347	*/
				348	if (page_uptodate && !PageError(page))
				349	SetPageUptodate(page);
				350	unlock_page(page);
				351	return;
				352
				353	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	354	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				355	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	356	return;
				357	}
				358
				359	/*
				360	* Completion handler for block_write_full_page() - pages which are unlocked
				361	* during I/O, and which have PageWriteback cleared upon I/O completion.
				362	*/
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	363	void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	364	{
				365	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	366	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	367	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	368	struct buffer_head *tmp;
				369	struct page *page;
				370
				371	BUG_ON(!buffer_async_write(bh));
				372
				373	page = bh->b_page;
				374	if (uptodate) {
				375	set_buffer_uptodate(bh);
				376	} else {
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	377	if (!quiet_error(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	378	buffer_io_error(bh);
				379	printk(KERN_WARNING "lost page write due to "
				380	"I/O error on %s\n",
				381	bdevname(bh->b_bdev, b));
				382	}
				383	set_bit(AS_EIO, &page->mapping->flags);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	384	set_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	385	clear_buffer_uptodate(bh);
				386	SetPageError(page);
				387	}
				388
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	389	first = page_buffers(page);
				390	local_irq_save(flags);
				391	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				392
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	393	clear_buffer_async_write(bh);
				394	unlock_buffer(bh);
				395	tmp = bh->b_this_page;
				396	while (tmp != bh) {
				397	if (buffer_async_write(tmp)) {
				398	BUG_ON(!buffer_locked(tmp));
				399	goto still_busy;
				400	}
				401	tmp = tmp->b_this_page;
				402	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	403	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				404	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	405	end_page_writeback(page);
				406	return;
				407
				408	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	409	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				410	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	411	return;
				412	}
				413
				414	/*
				415	* If a page's buffers are under async readin (end_buffer_async_read
				416	* completion) then there is a possibility that another thread of
				417	* control could lock one of the buffers after it has completed
				418	* but while some of the other buffers have not completed. This
				419	* locked buffer would confuse end_buffer_async_read() into not unlocking
				420	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				421	* that this buffer is not under async I/O.
				422	*
				423	* The page comes unlocked when it has no locked buffer_async buffers
				424	* left.
				425	*
				426	* PageLocked prevents anyone from starting new async I/O reads against any of
				427	* the buffers.
				428	*
				429	* PageWriteback is used to prevent simultaneous writeout of the same
				430	* page.
				431	*
				432	* PageLocked prevents anyone from starting writeback of a page which is
				433	* under read I/O (PageWriteback is only ever set against a locked page).
				434	*/
				435	static void mark_buffer_async_read(struct buffer_head *bh)
				436	{
				437	bh->b_end_io = end_buffer_async_read;
				438	set_buffer_async_read(bh);
				439	}
				440
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	441	void mark_buffer_async_write_endio(struct buffer_head *bh,
				442	bh_end_io_t *handler)
				443	{
				444	bh->b_end_io = handler;
				445	set_buffer_async_write(bh);
				446	}
				447
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	448	void mark_buffer_async_write(struct buffer_head *bh)
				449	{
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	450	mark_buffer_async_write_endio(bh, end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	451	}
				452	EXPORT_SYMBOL(mark_buffer_async_write);
				453
				454
				455	/*
				456	* fs/buffer.c contains helper functions for buffer-backed address space's
				457	* fsync functions. A common requirement for buffer-based filesystems is
				458	* that certain data from the backing blockdev needs to be written out for
				459	* a successful fsync(). For example, ext2 indirect blocks need to be
				460	* written back and waited upon before fsync() returns.
				461	*
				462	* The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
				463	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				464	* management of a list of dependent buffers at ->i_mapping->private_list.
				465	*
				466	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				467	* from their controlling inode's queue when they are being freed. But
				468	* try_to_free_buffers() will be operating against the blockdev mapping
				469	* at the time, not against the S_ISREG file which depends on those buffers.
				470	* So the locking for private_list is via the private_lock in the address_space
				471	* which backs the buffers. Which is different from the address_space
				472	* against which the buffers are listed. So for a particular address_space,
				473	* mapping->private_lock does not protect mapping->private_list! In fact,
				474	* mapping->private_list will always be protected by the backing blockdev's
				475	* ->private_lock.
				476	*
				477	* Which introduces a requirement: all buffers on an address_space's
				478	* ->private_list must be from the same address_space: the blockdev's.
				479	*
				480	* address_spaces which do not place buffers at ->private_list via these
				481	* utility functions are free to use private_lock and private_list for
				482	* whatever they want. The only requirement is that list_empty(private_list)
				483	* be true at clear_inode() time.
				484	*
				485	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				486	* filesystems should do that. invalidate_inode_buffers() should just go
				487	* BUG_ON(!list_empty).
				488	*
				489	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				490	* take an address_space, not an inode. And it should be called
				491	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				492	* queued up.
				493	*
				494	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				495	* list if it is already on a list. Because if the buffer is on a list,
				496	* it must already be on the right one. If not, the filesystem is being
				497	* silly. This will save a ton of locking. But first we have to ensure
				498	* that buffers are taken off the old inode's list when they are freed
				499	* (presumably in truncate). That requires careful auditing of all
				500	* filesystems (do it inside bforget()). It could also be done by bringing
				501	* b_inode back.
				502	*/
				503
				504	/*
				505	* The buffer's backing address_space's private_lock must be held
				506	*/
Thomas Petazzoni	dbacefc	2008-07-29 22:33:47 -0700	[diff] [blame]	507	static void __remove_assoc_queue(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	508	{
				509	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	510	WARN_ON(!bh->b_assoc_map);
				511	if (buffer_write_io_error(bh))
				512	set_bit(AS_EIO, &bh->b_assoc_map->flags);
				513	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	514	}
				515
				516	int inode_has_buffers(struct inode *inode)
				517	{
				518	return !list_empty(&inode->i_data.private_list);
				519	}
				520
				521	/*
				522	* osync is designed to support O_SYNC io. It waits synchronously for
				523	* all already-submitted IO to complete, but does not queue any new
				524	* writes to the disk.
				525	*
				526	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				527	* you dirty the buffers, and then use osync_inode_buffers to wait for
				528	* completion. Any other dirty buffers which are not yet queued for
				529	* write will not be flushed to disk by the osync.
				530	*/
				531	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				532	{
				533	struct buffer_head *bh;
				534	struct list_head *p;
				535	int err = 0;
				536
				537	spin_lock(lock);
				538	repeat:
				539	list_for_each_prev(p, list) {
				540	bh = BH_ENTRY(p);
				541	if (buffer_locked(bh)) {
				542	get_bh(bh);
				543	spin_unlock(lock);
				544	wait_on_buffer(bh);
				545	if (!buffer_uptodate(bh))
				546	err = -EIO;
				547	brelse(bh);
				548	spin_lock(lock);
				549	goto repeat;
				550	}
				551	}
				552	spin_unlock(lock);
				553	return err;
				554	}
				555
Jens Axboe	053c525	2009-04-08 13:44:08 +0200	[diff] [blame]	556	void do_thaw_all(struct work_struct *work)
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	557	{
				558	struct super_block *sb;
				559	char b[BDEVNAME_SIZE];
				560
				561	spin_lock(&sb_lock);
				562	restart:
				563	list_for_each_entry(sb, &super_blocks, s_list) {
				564	sb->s_count++;
				565	spin_unlock(&sb_lock);
				566	down_read(&sb->s_umount);
				567	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
				568	printk(KERN_WARNING "Emergency Thaw on %s\n",
				569	bdevname(sb->s_bdev, b));
				570	up_read(&sb->s_umount);
				571	spin_lock(&sb_lock);
				572	if (__put_super_and_need_restart(sb))
				573	goto restart;
				574	}
				575	spin_unlock(&sb_lock);
Jens Axboe	053c525	2009-04-08 13:44:08 +0200	[diff] [blame]	576	kfree(work);
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	577	printk(KERN_WARNING "Emergency Thaw complete\n");
				578	}
				579
				580	/**
				581	* emergency_thaw_all -- forcibly thaw every frozen filesystem
				582	*
				583	* Used for emergency unfreeze of all filesystems via SysRq
				584	*/
				585	void emergency_thaw_all(void)
				586	{
Jens Axboe	053c525	2009-04-08 13:44:08 +0200	[diff] [blame]	587	struct work_struct *work;
				588
				589	work = kmalloc(sizeof(*work), GFP_ATOMIC);
				590	if (work) {
				591	INIT_WORK(work, do_thaw_all);
				592	schedule_work(work);
				593	}
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	594	}
				595
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	596	/**
Randy Dunlap	78a4a50	2008-02-29 22:02:31 -0800	[diff] [blame]	597	* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	598	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	599	*
				600	* Starts I/O against the buffers at mapping->private_list, and waits upon
				601	* that I/O.
				602	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	603	* Basically, this is a convenience function for fsync().
				604	* @mapping is a file or directory which needs those buffers to be written for
				605	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	606	*/
				607	int sync_mapping_buffers(struct address_space *mapping)
				608	{
				609	struct address_space *buffer_mapping = mapping->assoc_mapping;
				610
				611	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				612	return 0;
				613
				614	return fsync_buffers_list(&buffer_mapping->private_lock,
				615	&mapping->private_list);
				616	}
				617	EXPORT_SYMBOL(sync_mapping_buffers);
				618
				619	/*
				620	* Called when we've recently written block `bblock', and it is known that
				621	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				622	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				623	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				624	*/
				625	void write_boundary_block(struct block_device *bdev,
				626	sector_t bblock, unsigned blocksize)
				627	{
				628	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				629	if (bh) {
				630	if (buffer_dirty(bh))
				631	ll_rw_block(WRITE, 1, &bh);
				632	put_bh(bh);
				633	}
				634	}
				635
				636	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				637	{
				638	struct address_space *mapping = inode->i_mapping;
				639	struct address_space *buffer_mapping = bh->b_page->mapping;
				640
				641	mark_buffer_dirty(bh);
				642	if (!mapping->assoc_mapping) {
				643	mapping->assoc_mapping = buffer_mapping;
				644	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	645	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	646	}
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	647	if (!bh->b_assoc_map) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	648	spin_lock(&buffer_mapping->private_lock);
				649	list_move_tail(&bh->b_assoc_buffers,
				650	&mapping->private_list);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	651	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	652	spin_unlock(&buffer_mapping->private_lock);
				653	}
				654	}
				655	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				656
				657	/*
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	658	* Mark the page dirty, and set it dirty in the radix tree, and mark the inode
				659	* dirty.
				660	*
				661	* If warn is true, then emit a warning if the page is not uptodate and has
				662	* not been truncated.
				663	*/
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	664	static void __set_page_dirty(struct page *page,
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	665	struct address_space *mapping, int warn)
				666	{
Nick Piggin	19fd623	2008-07-25 19:45:32 -0700	[diff] [blame]	667	spin_lock_irq(&mapping->tree_lock);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	668	if (page->mapping) { /* Race with truncate? */
				669	WARN_ON_ONCE(warn && !PageUptodate(page));
Edward Shishkin	e3a7cca	2009-03-31 15:19:39 -0700	[diff] [blame]	670	account_page_dirtied(page, mapping);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	671	radix_tree_tag_set(&mapping->page_tree,
				672	page_index(page), PAGECACHE_TAG_DIRTY);
				673	}
Nick Piggin	19fd623	2008-07-25 19:45:32 -0700	[diff] [blame]	674	spin_unlock_irq(&mapping->tree_lock);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	675	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	676	}
				677
				678	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	679	* Add a page to the dirty page list.
				680	*
				681	* It is a sad fact of life that this function is called from several places
				682	* deeply under spinlocking. It may not sleep.
				683	*
				684	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				685	* dirty-state coherency between the page and the buffers. If the page does
				686	* not have buffers then when they are later attached they will all be set
				687	* dirty.
				688	*
				689	* The buffers are dirtied before the page is dirtied. There's a small race
				690	* window in which a writepage caller may see the page cleanness but not the
				691	* buffer dirtiness. That's fine. If this code were to set the page dirty
				692	* before the buffers, a concurrent writepage caller could clear the page dirty
				693	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				694	* page on the dirty page list.
				695	*
				696	* We use private_lock to lock against try_to_free_buffers while using the
				697	* page's buffer list. Also use this to protect against clean buffers being
				698	* added to the page after it was set dirty.
				699	*
				700	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				701	* address_space though.
				702	*/
				703	int __set_page_dirty_buffers(struct page *page)
				704	{
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	705	int newly_dirty;
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	706	struct address_space *mapping = page_mapping(page);
Nick Piggin	ebf7a22	2006-10-10 04:36:54 +0200	[diff] [blame]	707
				708	if (unlikely(!mapping))
				709	return !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	710
				711	spin_lock(&mapping->private_lock);
				712	if (page_has_buffers(page)) {
				713	struct buffer_head *head = page_buffers(page);
				714	struct buffer_head *bh = head;
				715
				716	do {
				717	set_buffer_dirty(bh);
				718	bh = bh->b_this_page;
				719	} while (bh != head);
				720	}
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	721	newly_dirty = !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	722	spin_unlock(&mapping->private_lock);
				723
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	724	if (newly_dirty)
				725	__set_page_dirty(page, mapping, 1);
				726	return newly_dirty;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	727	}
				728	EXPORT_SYMBOL(__set_page_dirty_buffers);
				729
				730	/*
				731	* Write out and wait upon a list of buffers.
				732	*
				733	* We have conflicting pressures: we want to make sure that all
				734	* initially dirty buffers get waited on, but that any subsequently
				735	* dirtied buffers don't. After all, we don't want fsync to last
				736	* forever if somebody is actively writing to the file.
				737	*
				738	* Do this in two main stages: first we copy dirty buffers to a
				739	* temporary inode list, queueing the writes as we go. Then we clean
				740	* up, waiting for those writes to complete.
				741	*
				742	* During this second stage, any subsequent updates to the file may end
				743	* up refiling the buffer on the original inode's dirty list again, so
				744	* there is a chance we will end up with a buffer queued for write but
				745	* not yet completed on that list. So, as a final cleanup we go through
				746	* the osync code to catch these locked, dirty buffers without requeuing
				747	* any newly dirty buffers for write.
				748	*/
				749	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				750	{
				751	struct buffer_head *bh;
				752	struct list_head tmp;
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	753	struct address_space mapping, prev_mapping = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	754	int err = 0, err2;
				755
				756	INIT_LIST_HEAD(&tmp);
				757
				758	spin_lock(lock);
				759	while (!list_empty(list)) {
				760	bh = BH_ENTRY(list->next);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	761	mapping = bh->b_assoc_map;
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	762	__remove_assoc_queue(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	763	/* Avoid race with mark_buffer_dirty_inode() which does
				764	* a lockless check and we rely on seeing the dirty bit */
				765	smp_mb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	766	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				767	list_add(&bh->b_assoc_buffers, &tmp);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	768	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	769	if (buffer_dirty(bh)) {
				770	get_bh(bh);
				771	spin_unlock(lock);
				772	/*
				773	* Ensure any pending I/O completes so that
				774	* ll_rw_block() actually writes the current
				775	* contents - it is a noop if I/O is still in
				776	* flight on potentially older contents.
				777	*/
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	778	ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
				779
				780	/*
				781	* Kick off IO for the previous mapping. Note
				782	* that we will not run the very last mapping,
				783	* wait_on_buffer() will do that for us
				784	* through sync_buffer().
				785	*/
				786	if (prev_mapping && prev_mapping != mapping)
				787	blk_run_address_space(prev_mapping);
				788	prev_mapping = mapping;
				789
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	790	brelse(bh);
				791	spin_lock(lock);
				792	}
				793	}
				794	}
				795
				796	while (!list_empty(&tmp)) {
				797	bh = BH_ENTRY(tmp.prev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	798	get_bh(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	799	mapping = bh->b_assoc_map;
				800	__remove_assoc_queue(bh);
				801	/* Avoid race with mark_buffer_dirty_inode() which does
				802	* a lockless check and we rely on seeing the dirty bit */
				803	smp_mb();
				804	if (buffer_dirty(bh)) {
				805	list_add(&bh->b_assoc_buffers,
Jan Kara	e389229	2008-03-04 14:28:33 -0800	[diff] [blame]	806	&mapping->private_list);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	807	bh->b_assoc_map = mapping;
				808	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	809	spin_unlock(lock);
				810	wait_on_buffer(bh);
				811	if (!buffer_uptodate(bh))
				812	err = -EIO;
				813	brelse(bh);
				814	spin_lock(lock);
				815	}
				816
				817	spin_unlock(lock);
				818	err2 = osync_buffers_list(lock, list);
				819	if (err)
				820	return err;
				821	else
				822	return err2;
				823	}
				824
				825	/*
				826	* Invalidate any and all dirty buffers on a given inode. We are
				827	* probably unmounting the fs, but that doesn't mean we have already
				828	* done a sync(). Just drop the buffers from the inode list.
				829	*
				830	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				831	* assumes that all the buffers are against the blockdev. Not true
				832	* for reiserfs.
				833	*/
				834	void invalidate_inode_buffers(struct inode *inode)
				835	{
				836	if (inode_has_buffers(inode)) {
				837	struct address_space *mapping = &inode->i_data;
				838	struct list_head *list = &mapping->private_list;
				839	struct address_space *buffer_mapping = mapping->assoc_mapping;
				840
				841	spin_lock(&buffer_mapping->private_lock);
				842	while (!list_empty(list))
				843	__remove_assoc_queue(BH_ENTRY(list->next));
				844	spin_unlock(&buffer_mapping->private_lock);
				845	}
				846	}
Jan Kara	52b19ac	2008-09-23 18:24:08 +0200	[diff] [blame]	847	EXPORT_SYMBOL(invalidate_inode_buffers);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	848
				849	/*
				850	* Remove any clean buffers from the inode's buffer list. This is called
				851	* when we're trying to free the inode itself. Those buffers can pin it.
				852	*
				853	* Returns true if all buffers were removed.
				854	*/
				855	int remove_inode_buffers(struct inode *inode)
				856	{
				857	int ret = 1;
				858
				859	if (inode_has_buffers(inode)) {
				860	struct address_space *mapping = &inode->i_data;
				861	struct list_head *list = &mapping->private_list;
				862	struct address_space *buffer_mapping = mapping->assoc_mapping;
				863
				864	spin_lock(&buffer_mapping->private_lock);
				865	while (!list_empty(list)) {
				866	struct buffer_head *bh = BH_ENTRY(list->next);
				867	if (buffer_dirty(bh)) {
				868	ret = 0;
				869	break;
				870	}
				871	__remove_assoc_queue(bh);
				872	}
				873	spin_unlock(&buffer_mapping->private_lock);
				874	}
				875	return ret;
				876	}
				877
				878	/*
				879	* Create the appropriate buffers when given a page for data area and
				880	* the size of each buffer.. Use the bh->b_this_page linked list to
				881	* follow the buffers created. Return NULL if unable to create more
				882	* buffers.
				883	*
				884	* The retry flag is used to differentiate async IO (paging, swapping)
				885	* which may not fail from ordinary buffer allocations.
				886	*/
				887	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				888	int retry)
				889	{
				890	struct buffer_head bh, head;
				891	long offset;
				892
				893	try_again:
				894	head = NULL;
				895	offset = PAGE_SIZE;
				896	while ((offset -= size) >= 0) {
				897	bh = alloc_buffer_head(GFP_NOFS);
				898	if (!bh)
				899	goto no_grow;
				900
				901	bh->b_bdev = NULL;
				902	bh->b_this_page = head;
				903	bh->b_blocknr = -1;
				904	head = bh;
				905
				906	bh->b_state = 0;
				907	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	908	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	909	bh->b_size = size;
				910
				911	/* Link the buffer to its page */
				912	set_bh_page(bh, page, offset);
				913
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	914	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	915	}
				916	return head;
				917	/*
				918	* In case anything failed, we just free everything we got.
				919	*/
				920	no_grow:
				921	if (head) {
				922	do {
				923	bh = head;
				924	head = head->b_this_page;
				925	free_buffer_head(bh);
				926	} while (head);
				927	}
				928
				929	/*
				930	* Return failure for non-async IO requests. Async IO requests
				931	* are not allowed to fail, so we have to wait until buffer heads
				932	* become available. But we don't want tasks sleeping with
				933	* partially complete buffers, so all were released above.
				934	*/
				935	if (!retry)
				936	return NULL;
				937
				938	/* We're _really_ low on memory. Now we just
				939	* wait for old buffer heads to become free due to
				940	* finishing IO. Since this is an async request and
				941	* the reserve list is empty, we're sure there are
				942	* async buffer heads in use.
				943	*/
				944	free_more_memory();
				945	goto try_again;
				946	}
				947	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				948
				949	static inline void
				950	link_dev_buffers(struct page page, struct buffer_head head)
				951	{
				952	struct buffer_head bh, tail;
				953
				954	bh = head;
				955	do {
				956	tail = bh;
				957	bh = bh->b_this_page;
				958	} while (bh);
				959	tail->b_this_page = head;
				960	attach_page_buffers(page, head);
				961	}
				962
				963	/*
				964	* Initialise the state of a blockdev page's buffers.
				965	*/
				966	static void
				967	init_page_buffers(struct page page, struct block_device bdev,
				968	sector_t block, int size)
				969	{
				970	struct buffer_head *head = page_buffers(page);
				971	struct buffer_head *bh = head;
				972	int uptodate = PageUptodate(page);
				973
				974	do {
				975	if (!buffer_mapped(bh)) {
				976	init_buffer(bh, NULL, NULL);
				977	bh->b_bdev = bdev;
				978	bh->b_blocknr = block;
				979	if (uptodate)
				980	set_buffer_uptodate(bh);
				981	set_buffer_mapped(bh);
				982	}
				983	block++;
				984	bh = bh->b_this_page;
				985	} while (bh != head);
				986	}
				987
				988	/*
				989	* Create the page-cache page that contains the requested block.
				990	*
				991	* This is used purely for blockdev mappings.
				992	*/
				993	static struct page *
				994	grow_dev_page(struct block_device *bdev, sector_t block,
				995	pgoff_t index, int size)
				996	{
				997	struct inode *inode = bdev->bd_inode;
				998	struct page *page;
				999	struct buffer_head *bh;
				1000
Christoph Lameter	ea12589	2007-05-16 22:11:21 -0700	[diff] [blame]	1001	page = find_or_create_page(inode->i_mapping, index,
Mel Gorman	769848c	2007-07-17 04:03:05 -0700	[diff] [blame]	1002	(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)\|__GFP_MOVABLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1003	if (!page)
				1004	return NULL;
				1005
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1006	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1007
				1008	if (page_has_buffers(page)) {
				1009	bh = page_buffers(page);
				1010	if (bh->b_size == size) {
				1011	init_page_buffers(page, bdev, block, size);
				1012	return page;
				1013	}
				1014	if (!try_to_free_buffers(page))
				1015	goto failed;
				1016	}
				1017
				1018	/*
				1019	* Allocate some buffers for this page
				1020	*/
				1021	bh = alloc_page_buffers(page, size, 0);
				1022	if (!bh)
				1023	goto failed;
				1024
				1025	/*
				1026	* Link the page to the buffers and initialise them. Take the
				1027	* lock to be atomic wrt __find_get_block(), which does not
				1028	* run under the page lock.
				1029	*/
				1030	spin_lock(&inode->i_mapping->private_lock);
				1031	link_dev_buffers(page, bh);
				1032	init_page_buffers(page, bdev, block, size);
				1033	spin_unlock(&inode->i_mapping->private_lock);
				1034	return page;
				1035
				1036	failed:
				1037	BUG();
				1038	unlock_page(page);
				1039	page_cache_release(page);
				1040	return NULL;
				1041	}
				1042
				1043	/*
				1044	* Create buffers for the specified block device block's page. If
				1045	* that page was dirty, the buffers are set dirty also.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1046	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1047	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1048	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1049	{
				1050	struct page *page;
				1051	pgoff_t index;
				1052	int sizebits;
				1053
				1054	sizebits = -1;
				1055	do {
				1056	sizebits++;
				1057	} while ((size << sizebits) < PAGE_SIZE);
				1058
				1059	index = block >> sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1060
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1061	/*
				1062	* Check for a block which wants to lie outside our maximum possible
				1063	* pagecache index. (this comparison is done using sector_t types).
				1064	*/
				1065	if (unlikely(index != block >> sizebits)) {
				1066	char b[BDEVNAME_SIZE];
				1067
				1068	printk(KERN_ERR "%s: requested out-of-range block %llu for "
				1069	"device %s\n",
Harvey Harrison	8e24eea	2008-04-30 00:55:09 -0700	[diff] [blame]	1070	__func__, (unsigned long long)block,
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1071	bdevname(bdev, b));
				1072	return -EIO;
				1073	}
				1074	block = index << sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1075	/* Create a page with the proper size buffers.. */
				1076	page = grow_dev_page(bdev, block, index, size);
				1077	if (!page)
				1078	return 0;
				1079	unlock_page(page);
				1080	page_cache_release(page);
				1081	return 1;
				1082	}
				1083
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1084	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1085	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1086	{
				1087	/* Size must be multiple of hard sectorsize */
Martin K. Petersen	e1defc4	2009-05-22 17:17:49 -0400	[diff] [blame]	1088	if (unlikely(size & (bdev_logical_block_size(bdev)-1) \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1089	(size < 512 \|\| size > PAGE_SIZE))) {
				1090	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1091	size);
Martin K. Petersen	e1defc4	2009-05-22 17:17:49 -0400	[diff] [blame]	1092	printk(KERN_ERR "logical block size: %d\n",
				1093	bdev_logical_block_size(bdev));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1094
				1095	dump_stack();
				1096	return NULL;
				1097	}
				1098
				1099	for (;;) {
				1100	struct buffer_head * bh;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1101	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1102
				1103	bh = __find_get_block(bdev, block, size);
				1104	if (bh)
				1105	return bh;
				1106
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1107	ret = grow_buffers(bdev, block, size);
				1108	if (ret < 0)
				1109	return NULL;
				1110	if (ret == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1111	free_more_memory();
				1112	}
				1113	}
				1114
				1115	/*
				1116	* The relationship between dirty buffers and dirty pages:
				1117	*
				1118	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1119	* the page is tagged dirty in its radix tree.
				1120	*
				1121	* At all times, the dirtiness of the buffers represents the dirtiness of
				1122	* subsections of the page. If the page has buffers, the page dirty bit is
				1123	* merely a hint about the true dirty state.
				1124	*
				1125	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1126	* (if the page has buffers).
				1127	*
				1128	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1129	* buffers are not.
				1130	*
				1131	* Also. When blockdev buffers are explicitly read with bread(), they
				1132	* individually become uptodate. But their backing page remains not
				1133	* uptodate - even if all of its buffers are uptodate. A subsequent
				1134	* block_read_full_page() against that page will discover all the uptodate
				1135	* buffers, will set the page uptodate and will perform no I/O.
				1136	*/
				1137
				1138	/**
				1139	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1140	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1141	*
				1142	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1143	* backing page dirty, then tag the page as dirty in its address_space's radix
				1144	* tree and then attach the address_space's inode to its superblock's dirty
				1145	* inode list.
				1146	*
				1147	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1148	* mapping->tree_lock and the global inode_lock.
				1149	*/
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	1150	void mark_buffer_dirty(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1151	{
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	1152	WARN_ON_ONCE(!buffer_uptodate(bh));
Linus Torvalds	1be62dc	2008-04-04 14:38:17 -0700	[diff] [blame]	1153
				1154	/*
				1155	* Very carefully optimize the it-is-already-dirty case.
				1156	*
				1157	* Don't let the final "is it dirty" escape to before we
				1158	* perhaps modified the buffer.
				1159	*/
				1160	if (buffer_dirty(bh)) {
				1161	smp_mb();
				1162	if (buffer_dirty(bh))
				1163	return;
				1164	}
				1165
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	1166	if (!test_set_buffer_dirty(bh)) {
				1167	struct page *page = bh->b_page;
Linus Torvalds	8e9d78e	2009-08-21 17:40:08 -0700	[diff] [blame]	1168	if (!TestSetPageDirty(page)) {
				1169	struct address_space *mapping = page_mapping(page);
				1170	if (mapping)
				1171	__set_page_dirty(page, mapping, 0);
				1172	}
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	1173	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1174	}
				1175
				1176	/*
				1177	* Decrement a buffer_head's reference count. If all buffers against a page
				1178	* have zero reference count, are clean and unlocked, and if the page is clean
				1179	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1180	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1181	* a page but it ends up not being freed, and buffers may later be reattached).
				1182	*/
				1183	void __brelse(struct buffer_head * buf)
				1184	{
				1185	if (atomic_read(&buf->b_count)) {
				1186	put_bh(buf);
				1187	return;
				1188	}
Arjan van de Ven	5c752ad	2008-07-25 19:45:40 -0700	[diff] [blame]	1189	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1190	}
				1191
				1192	/*
				1193	* bforget() is like brelse(), except it discards any
				1194	* potentially dirty data.
				1195	*/
				1196	void __bforget(struct buffer_head *bh)
				1197	{
				1198	clear_buffer_dirty(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	1199	if (bh->b_assoc_map) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1200	struct address_space *buffer_mapping = bh->b_page->mapping;
				1201
				1202	spin_lock(&buffer_mapping->private_lock);
				1203	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	1204	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1205	spin_unlock(&buffer_mapping->private_lock);
				1206	}
				1207	__brelse(bh);
				1208	}
				1209
				1210	static struct buffer_head __bread_slow(struct buffer_head bh)
				1211	{
				1212	lock_buffer(bh);
				1213	if (buffer_uptodate(bh)) {
				1214	unlock_buffer(bh);
				1215	return bh;
				1216	} else {
				1217	get_bh(bh);
				1218	bh->b_end_io = end_buffer_read_sync;
				1219	submit_bh(READ, bh);
				1220	wait_on_buffer(bh);
				1221	if (buffer_uptodate(bh))
				1222	return bh;
				1223	}
				1224	brelse(bh);
				1225	return NULL;
				1226	}
				1227
				1228	/*
				1229	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1230	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1231	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1232	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1233	* CPU's LRUs at the same time.
				1234	*
				1235	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1236	* sb_find_get_block().
				1237	*
				1238	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1239	* a local interrupt disable for that.
				1240	*/
				1241
				1242	#define BH_LRU_SIZE 8
				1243
				1244	struct bh_lru {
				1245	struct buffer_head *bhs[BH_LRU_SIZE];
				1246	};
				1247
				1248	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1249
				1250	#ifdef CONFIG_SMP
				1251	#define bh_lru_lock() local_irq_disable()
				1252	#define bh_lru_unlock() local_irq_enable()
				1253	#else
				1254	#define bh_lru_lock() preempt_disable()
				1255	#define bh_lru_unlock() preempt_enable()
				1256	#endif
				1257
				1258	static inline void check_irqs_on(void)
				1259	{
				1260	#ifdef irqs_disabled
				1261	BUG_ON(irqs_disabled());
				1262	#endif
				1263	}
				1264
				1265	/*
				1266	* The LRU management algorithm is dopey-but-simple. Sorry.
				1267	*/
				1268	static void bh_lru_install(struct buffer_head *bh)
				1269	{
				1270	struct buffer_head *evictee = NULL;
				1271	struct bh_lru *lru;
				1272
				1273	check_irqs_on();
				1274	bh_lru_lock();
				1275	lru = &__get_cpu_var(bh_lrus);
				1276	if (lru->bhs[0] != bh) {
				1277	struct buffer_head *bhs[BH_LRU_SIZE];
				1278	int in;
				1279	int out = 0;
				1280
				1281	get_bh(bh);
				1282	bhs[out++] = bh;
				1283	for (in = 0; in < BH_LRU_SIZE; in++) {
				1284	struct buffer_head *bh2 = lru->bhs[in];
				1285
				1286	if (bh2 == bh) {
				1287	__brelse(bh2);
				1288	} else {
				1289	if (out >= BH_LRU_SIZE) {
				1290	BUG_ON(evictee != NULL);
				1291	evictee = bh2;
				1292	} else {
				1293	bhs[out++] = bh2;
				1294	}
				1295	}
				1296	}
				1297	while (out < BH_LRU_SIZE)
				1298	bhs[out++] = NULL;
				1299	memcpy(lru->bhs, bhs, sizeof(bhs));
				1300	}
				1301	bh_lru_unlock();
				1302
				1303	if (evictee)
				1304	__brelse(evictee);
				1305	}
				1306
				1307	/*
				1308	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1309	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1310	static struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1311	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1312	{
				1313	struct buffer_head *ret = NULL;
				1314	struct bh_lru *lru;
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1315	unsigned int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1316
				1317	check_irqs_on();
				1318	bh_lru_lock();
				1319	lru = &__get_cpu_var(bh_lrus);
				1320	for (i = 0; i < BH_LRU_SIZE; i++) {
				1321	struct buffer_head *bh = lru->bhs[i];
				1322
				1323	if (bh && bh->b_bdev == bdev &&
				1324	bh->b_blocknr == block && bh->b_size == size) {
				1325	if (i) {
				1326	while (i) {
				1327	lru->bhs[i] = lru->bhs[i - 1];
				1328	i--;
				1329	}
				1330	lru->bhs[0] = bh;
				1331	}
				1332	get_bh(bh);
				1333	ret = bh;
				1334	break;
				1335	}
				1336	}
				1337	bh_lru_unlock();
				1338	return ret;
				1339	}
				1340
				1341	/*
				1342	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1343	* it in the LRU and mark it as accessed. If it is not present then return
				1344	* NULL
				1345	*/
				1346	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1347	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1348	{
				1349	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1350
				1351	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1352	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1353	if (bh)
				1354	bh_lru_install(bh);
				1355	}
				1356	if (bh)
				1357	touch_buffer(bh);
				1358	return bh;
				1359	}
				1360	EXPORT_SYMBOL(__find_get_block);
				1361
				1362	/*
				1363	* __getblk will locate (and, if necessary, create) the buffer_head
				1364	* which corresponds to the passed block_device, block and size. The
				1365	* returned buffer has its reference count incremented.
				1366	*
				1367	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1368	* illegal block number, __getblk() will happily return a buffer_head
				1369	* which represents the non-existent block. Very weird.
				1370	*
				1371	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1372	* attempt is failing. FIXME, perhaps?
				1373	*/
				1374	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1375	__getblk(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1376	{
				1377	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1378
				1379	might_sleep();
				1380	if (bh == NULL)
				1381	bh = __getblk_slow(bdev, block, size);
				1382	return bh;
				1383	}
				1384	EXPORT_SYMBOL(__getblk);
				1385
				1386	/*
				1387	* Do async read-ahead on a buffer..
				1388	*/
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1389	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1390	{
				1391	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1392	if (likely(bh)) {
				1393	ll_rw_block(READA, 1, &bh);
				1394	brelse(bh);
				1395	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1396	}
				1397	EXPORT_SYMBOL(__breadahead);
				1398
				1399	/**
				1400	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1401	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1402	* @block: number of block
				1403	* @size: size (in bytes) to read
				1404	*
				1405	* Reads a specified block, and returns buffer head that contains it.
				1406	* It returns NULL if the block was unreadable.
				1407	*/
				1408	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1409	__bread(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1410	{
				1411	struct buffer_head *bh = __getblk(bdev, block, size);
				1412
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1413	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1414	bh = __bread_slow(bh);
				1415	return bh;
				1416	}
				1417	EXPORT_SYMBOL(__bread);
				1418
				1419	/*
				1420	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1421	* This doesn't race because it runs in each cpu either in irq
				1422	* or with preempt disabled.
				1423	*/
				1424	static void invalidate_bh_lru(void *arg)
				1425	{
				1426	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1427	int i;
				1428
				1429	for (i = 0; i < BH_LRU_SIZE; i++) {
				1430	brelse(b->bhs[i]);
				1431	b->bhs[i] = NULL;
				1432	}
				1433	put_cpu_var(bh_lrus);
				1434	}
				1435
Peter Zijlstra	f9a1439	2007-05-06 14:49:55 -0700	[diff] [blame]	1436	void invalidate_bh_lrus(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1437	{
Jens Axboe	15c8b6c	2008-05-09 09:39:44 +0200	[diff] [blame]	1438	on_each_cpu(invalidate_bh_lru, NULL, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1439	}
Nick Piggin	9db5579	2008-02-08 04:19:49 -0800	[diff] [blame]	1440	EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1441
				1442	void set_bh_page(struct buffer_head *bh,
				1443	struct page *page, unsigned long offset)
				1444	{
				1445	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1446	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1447	if (PageHighMem(page))
				1448	/*
				1449	* This catches illegal uses and preserves the offset:
				1450	*/
				1451	bh->b_data = (char *)(0 + offset);
				1452	else
				1453	bh->b_data = page_address(page) + offset;
				1454	}
				1455	EXPORT_SYMBOL(set_bh_page);
				1456
				1457	/*
				1458	* Called when truncating a buffer on a page completely.
				1459	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1460	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1461	{
				1462	lock_buffer(bh);
				1463	clear_buffer_dirty(bh);
				1464	bh->b_bdev = NULL;
				1465	clear_buffer_mapped(bh);
				1466	clear_buffer_req(bh);
				1467	clear_buffer_new(bh);
				1468	clear_buffer_delay(bh);
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1469	clear_buffer_unwritten(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1470	unlock_buffer(bh);
				1471	}
				1472
				1473	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1474	* block_invalidatepage - invalidate part or all of a buffer-backed page
				1475	*
				1476	* @page: the page which is affected
				1477	* @offset: the index of the truncation point
				1478	*
				1479	* block_invalidatepage() is called when all or part of the page has become
				1480	* invalidated by a truncate operation.
				1481	*
				1482	* block_invalidatepage() does not have to release all buffers, but it must
				1483	* ensure that no dirty buffer is left outside @offset and that no I/O
				1484	* is underway against any of the blocks which are outside the truncation
				1485	* point. Because the caller is about to free (and possibly reuse) those
				1486	* blocks on-disk.
				1487	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1488	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1489	{
				1490	struct buffer_head head, bh, *next;
				1491	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1492
				1493	BUG_ON(!PageLocked(page));
				1494	if (!page_has_buffers(page))
				1495	goto out;
				1496
				1497	head = page_buffers(page);
				1498	bh = head;
				1499	do {
				1500	unsigned int next_off = curr_off + bh->b_size;
				1501	next = bh->b_this_page;
				1502
				1503	/*
				1504	* is this block fully invalidated?
				1505	*/
				1506	if (offset <= curr_off)
				1507	discard_buffer(bh);
				1508	curr_off = next_off;
				1509	bh = next;
				1510	} while (bh != head);
				1511
				1512	/*
				1513	* We release buffers only if the entire page is being invalidated.
				1514	* The get_block cached value has been unconditionally invalidated,
				1515	* so real IO is not possible anymore.
				1516	*/
				1517	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1518	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1519	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1520	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1521	}
				1522	EXPORT_SYMBOL(block_invalidatepage);
				1523
				1524	/*
				1525	* We attach and possibly dirty the buffers atomically wrt
				1526	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1527	* is already excluded via the page lock.
				1528	*/
				1529	void create_empty_buffers(struct page *page,
				1530	unsigned long blocksize, unsigned long b_state)
				1531	{
				1532	struct buffer_head bh, head, *tail;
				1533
				1534	head = alloc_page_buffers(page, blocksize, 1);
				1535	bh = head;
				1536	do {
				1537	bh->b_state \|= b_state;
				1538	tail = bh;
				1539	bh = bh->b_this_page;
				1540	} while (bh);
				1541	tail->b_this_page = head;
				1542
				1543	spin_lock(&page->mapping->private_lock);
				1544	if (PageUptodate(page) \|\| PageDirty(page)) {
				1545	bh = head;
				1546	do {
				1547	if (PageDirty(page))
				1548	set_buffer_dirty(bh);
				1549	if (PageUptodate(page))
				1550	set_buffer_uptodate(bh);
				1551	bh = bh->b_this_page;
				1552	} while (bh != head);
				1553	}
				1554	attach_page_buffers(page, head);
				1555	spin_unlock(&page->mapping->private_lock);
				1556	}
				1557	EXPORT_SYMBOL(create_empty_buffers);
				1558
				1559	/*
				1560	* We are taking a block for data and we don't want any output from any
				1561	* buffer-cache aliases starting from return from that function and
				1562	* until the moment when something will explicitly mark the buffer
				1563	* dirty (hopefully that will not happen until we will free that block ;-)
				1564	* We don't even need to mark it not-uptodate - nobody can expect
				1565	* anything from a newly allocated buffer anyway. We used to use
				1566	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1567	* don't want to mark the alias unmapped, for example - it would confuse
				1568	* anyone who might pick it with bread() afterwards...
				1569	*
				1570	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1571	* be writeout I/O going on against recently-freed buffers. We don't
				1572	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1573	* only if we really need to. That happens here.
				1574	*/
				1575	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1576	{
				1577	struct buffer_head *old_bh;
				1578
				1579	might_sleep();
				1580
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1581	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1582	if (old_bh) {
				1583	clear_buffer_dirty(old_bh);
				1584	wait_on_buffer(old_bh);
				1585	clear_buffer_req(old_bh);
				1586	__brelse(old_bh);
				1587	}
				1588	}
				1589	EXPORT_SYMBOL(unmap_underlying_metadata);
				1590
				1591	/*
				1592	* NOTE! All mapped/uptodate combinations are valid:
				1593	*
				1594	* Mapped Uptodate Meaning
				1595	*
				1596	* No No "unknown" - must do get_block()
				1597	* No Yes "hole" - zero-filled
				1598	* Yes No "allocated" - allocated on disk, not read in
				1599	* Yes Yes "valid" - allocated and up-to-date in memory.
				1600	*
				1601	* "Dirty" is valid only with the last case (mapped+uptodate).
				1602	*/
				1603
				1604	/*
				1605	* While block_write_full_page is writing back the dirty buffers under
				1606	* the page lock, whoever dirtied the buffers may decide to clean them
				1607	* again at any time. We handle that by only looking at the buffer
				1608	* state inside lock_buffer().
				1609	*
				1610	* If block_write_full_page() is called for regular writeback
				1611	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1612	* locked buffer. This only can happen if someone has written the buffer
				1613	* directly, with submit_bh(). At the address_space level PageWriteback
				1614	* prevents this contention from occurring.
Theodore Ts'o	6e34eed	2009-04-07 18:12:43 -0400	[diff] [blame]	1615	*
				1616	* If block_write_full_page() is called with wbc->sync_mode ==
				1617	* WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
				1618	* causes the writes to be flagged as synchronous writes, but the
				1619	* block device queue will NOT be unplugged, since usually many pages
				1620	* will be pushed to the out before the higher-level caller actually
				1621	* waits for the writes to be completed. The various wait functions,
				1622	* such as wait_on_writeback_range() will ultimately call sync_page()
				1623	* which will ultimately call blk_run_backing_dev(), which will end up
				1624	* unplugging the device queue.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1625	*/
				1626	static int __block_write_full_page(struct inode inode, struct page page,
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1627	get_block_t get_block, struct writeback_control wbc,
				1628	bh_end_io_t *handler)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1629	{
				1630	int err;
				1631	sector_t block;
				1632	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1633	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1634	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1635	int nr_underway = 0;
Theodore Ts'o	6e34eed	2009-04-07 18:12:43 -0400	[diff] [blame]	1636	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
				1637	WRITE_SYNC_PLUG : WRITE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1638
				1639	BUG_ON(!PageLocked(page));
				1640
				1641	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1642
				1643	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1644	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1645	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1646	}
				1647
				1648	/*
				1649	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1650	* here, and the (potentially unmapped) buffers may become dirty at
				1651	* any time. If a buffer becomes dirty here after we've inspected it
				1652	* then we just miss that fact, and the page stays dirty.
				1653	*
				1654	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1655	* handle that here by just cleaning them.
				1656	*/
				1657
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1658	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1659	head = page_buffers(page);
				1660	bh = head;
				1661
				1662	/*
				1663	* Get all the dirty buffers mapped to disk addresses and
				1664	* handle any aliases from the underlying blockdev's mapping.
				1665	*/
				1666	do {
				1667	if (block > last_block) {
				1668	/*
				1669	* mapped buffers outside i_size will occur, because
				1670	* this page can be outside i_size when there is a
				1671	* truncate in progress.
				1672	*/
				1673	/*
				1674	* The buffer was zeroed by block_write_full_page()
				1675	*/
				1676	clear_buffer_dirty(bh);
				1677	set_buffer_uptodate(bh);
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1678	} else if ((!buffer_mapped(bh) \|\| buffer_delay(bh)) &&
				1679	buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1680	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1681	err = get_block(inode, block, bh, 1);
				1682	if (err)
				1683	goto recover;
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1684	clear_buffer_delay(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1685	if (buffer_new(bh)) {
				1686	/* blockdev mappings never come here */
				1687	clear_buffer_new(bh);
				1688	unmap_underlying_metadata(bh->b_bdev,
				1689	bh->b_blocknr);
				1690	}
				1691	}
				1692	bh = bh->b_this_page;
				1693	block++;
				1694	} while (bh != head);
				1695
				1696	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1697	if (!buffer_mapped(bh))
				1698	continue;
				1699	/*
				1700	* If it's a fully non-blocking write attempt and we cannot
				1701	* lock the buffer then redirty the page. Note that this can
				1702	* potentially cause a busy-wait loop from pdflush and kswapd
				1703	* activity, but those code paths have their own higher-level
				1704	* throttling.
				1705	*/
				1706	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1707	lock_buffer(bh);
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	1708	} else if (!trylock_buffer(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1709	redirty_page_for_writepage(wbc, page);
				1710	continue;
				1711	}
				1712	if (test_clear_buffer_dirty(bh)) {
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1713	mark_buffer_async_write_endio(bh, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1714	} else {
				1715	unlock_buffer(bh);
				1716	}
				1717	} while ((bh = bh->b_this_page) != head);
				1718
				1719	/*
				1720	* The page and its buffers are protected by PageWriteback(), so we can
				1721	* drop the bh refcounts early.
				1722	*/
				1723	BUG_ON(PageWriteback(page));
				1724	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1725
				1726	do {
				1727	struct buffer_head *next = bh->b_this_page;
				1728	if (buffer_async_write(bh)) {
Theodore Ts'o	a64c861	2009-03-27 22:14:10 -0400	[diff] [blame]	1729	submit_bh(write_op, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1730	nr_underway++;
				1731	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1732	bh = next;
				1733	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1734	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1735
				1736	err = 0;
				1737	done:
				1738	if (nr_underway == 0) {
				1739	/*
				1740	* The page was marked dirty, but the buffers were
				1741	* clean. Someone wrote them back by hand with
				1742	* ll_rw_block/submit_bh. A rare case.
				1743	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1744	end_page_writeback(page);
Nick Piggin	3d67f2d	2007-05-06 14:49:05 -0700	[diff] [blame]	1745
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1746	/*
				1747	* The page and buffer_heads can be released at any time from
				1748	* here on.
				1749	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1750	}
				1751	return err;
				1752
				1753	recover:
				1754	/*
				1755	* ENOSPC, or some other error. We may already have added some
				1756	* blocks to the file, so we need to write these out to avoid
				1757	* exposing stale data.
				1758	* The page is currently locked and not marked for writeback
				1759	*/
				1760	bh = head;
				1761	/* Recovery: lock and submit the mapped buffers */
				1762	do {
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1763	if (buffer_mapped(bh) && buffer_dirty(bh) &&
				1764	!buffer_delay(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1765	lock_buffer(bh);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1766	mark_buffer_async_write_endio(bh, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1767	} else {
				1768	/*
				1769	* The buffer may have been set dirty during
				1770	* attachment to a dirty page.
				1771	*/
				1772	clear_buffer_dirty(bh);
				1773	}
				1774	} while ((bh = bh->b_this_page) != head);
				1775	SetPageError(page);
				1776	BUG_ON(PageWriteback(page));
Andrew Morton	7e4c369	2007-05-08 00:23:27 -0700	[diff] [blame]	1777	mapping_set_error(page->mapping, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1778	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1779	do {
				1780	struct buffer_head *next = bh->b_this_page;
				1781	if (buffer_async_write(bh)) {
				1782	clear_buffer_dirty(bh);
Theodore Ts'o	a64c861	2009-03-27 22:14:10 -0400	[diff] [blame]	1783	submit_bh(write_op, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1784	nr_underway++;
				1785	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1786	bh = next;
				1787	} while (bh != head);
Nick Piggin	ffda9d3	2007-02-20 13:57:54 -0800	[diff] [blame]	1788	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1789	goto done;
				1790	}
				1791
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1792	/*
				1793	* If a page has any new buffers, zero them out here, and mark them uptodate
				1794	* and dirty so they'll be written out (in order to prevent uninitialised
				1795	* block data from leaking). And clear the new bit.
				1796	*/
				1797	void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
				1798	{
				1799	unsigned int block_start, block_end;
				1800	struct buffer_head head, bh;
				1801
				1802	BUG_ON(!PageLocked(page));
				1803	if (!page_has_buffers(page))
				1804	return;
				1805
				1806	bh = head = page_buffers(page);
				1807	block_start = 0;
				1808	do {
				1809	block_end = block_start + bh->b_size;
				1810
				1811	if (buffer_new(bh)) {
				1812	if (block_end > from && block_start < to) {
				1813	if (!PageUptodate(page)) {
				1814	unsigned start, size;
				1815
				1816	start = max(from, block_start);
				1817	size = min(to, block_end) - start;
				1818
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1819	zero_user(page, start, size);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1820	set_buffer_uptodate(bh);
				1821	}
				1822
				1823	clear_buffer_new(bh);
				1824	mark_buffer_dirty(bh);
				1825	}
				1826	}
				1827
				1828	block_start = block_end;
				1829	bh = bh->b_this_page;
				1830	} while (bh != head);
				1831	}
				1832	EXPORT_SYMBOL(page_zero_new_buffers);
				1833
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1834	static int __block_prepare_write(struct inode inode, struct page page,
				1835	unsigned from, unsigned to, get_block_t *get_block)
				1836	{
				1837	unsigned block_start, block_end;
				1838	sector_t block;
				1839	int err = 0;
				1840	unsigned blocksize, bbits;
				1841	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1842
				1843	BUG_ON(!PageLocked(page));
				1844	BUG_ON(from > PAGE_CACHE_SIZE);
				1845	BUG_ON(to > PAGE_CACHE_SIZE);
				1846	BUG_ON(from > to);
				1847
				1848	blocksize = 1 << inode->i_blkbits;
				1849	if (!page_has_buffers(page))
				1850	create_empty_buffers(page, blocksize, 0);
				1851	head = page_buffers(page);
				1852
				1853	bbits = inode->i_blkbits;
				1854	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1855
				1856	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1857	block++, block_start=block_end, bh = bh->b_this_page) {
				1858	block_end = block_start + blocksize;
				1859	if (block_end <= from \|\| block_start >= to) {
				1860	if (PageUptodate(page)) {
				1861	if (!buffer_uptodate(bh))
				1862	set_buffer_uptodate(bh);
				1863	}
				1864	continue;
				1865	}
				1866	if (buffer_new(bh))
				1867	clear_buffer_new(bh);
				1868	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1869	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1870	err = get_block(inode, block, bh, 1);
				1871	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1872	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1873	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1874	unmap_underlying_metadata(bh->b_bdev,
				1875	bh->b_blocknr);
				1876	if (PageUptodate(page)) {
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1877	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1878	set_buffer_uptodate(bh);
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1879	mark_buffer_dirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1880	continue;
				1881	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1882	if (block_end > to \|\| block_start < from)
				1883	zero_user_segments(page,
				1884	to, block_end,
				1885	block_start, from);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1886	continue;
				1887	}
				1888	}
				1889	if (PageUptodate(page)) {
				1890	if (!buffer_uptodate(bh))
				1891	set_buffer_uptodate(bh);
				1892	continue;
				1893	}
				1894	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1895	!buffer_unwritten(bh) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1896	(block_start < from \|\| block_end > to)) {
				1897	ll_rw_block(READ, 1, &bh);
				1898	*wait_bh++=bh;
				1899	}
				1900	}
				1901	/*
				1902	* If we issued read requests - let them complete.
				1903	*/
				1904	while(wait_bh > wait) {
				1905	wait_on_buffer(*--wait_bh);
				1906	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1907	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1908	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1909	if (unlikely(err))
				1910	page_zero_new_buffers(page, from, to);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1911	return err;
				1912	}
				1913
				1914	static int __block_commit_write(struct inode inode, struct page page,
				1915	unsigned from, unsigned to)
				1916	{
				1917	unsigned block_start, block_end;
				1918	int partial = 0;
				1919	unsigned blocksize;
				1920	struct buffer_head bh, head;
				1921
				1922	blocksize = 1 << inode->i_blkbits;
				1923
				1924	for(bh = head = page_buffers(page), block_start = 0;
				1925	bh != head \|\| !block_start;
				1926	block_start=block_end, bh = bh->b_this_page) {
				1927	block_end = block_start + blocksize;
				1928	if (block_end <= from \|\| block_start >= to) {
				1929	if (!buffer_uptodate(bh))
				1930	partial = 1;
				1931	} else {
				1932	set_buffer_uptodate(bh);
				1933	mark_buffer_dirty(bh);
				1934	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1935	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1936	}
				1937
				1938	/*
				1939	* If this is a partial write which happened to make all buffers
				1940	* uptodate then we can optimize away a bogus readpage() for
				1941	* the next read(). Here we 'discover' whether the page went
				1942	* uptodate as a result of this (potentially partial) write.
				1943	*/
				1944	if (!partial)
				1945	SetPageUptodate(page);
				1946	return 0;
				1947	}
				1948
				1949	/*
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1950	* block_write_begin takes care of the basic task of block allocation and
				1951	* bringing partial write blocks uptodate first.
				1952	*
				1953	* If *pagep is not NULL, then block_write_begin uses the locked page
				1954	* at *pagep rather than allocating its own. In this case, the page will
				1955	* not be unlocked or deallocated on failure.
				1956	*/
				1957	int block_write_begin(struct file file, struct address_space mapping,
				1958	loff_t pos, unsigned len, unsigned flags,
				1959	struct page pagep, void fsdata,
				1960	get_block_t *get_block)
				1961	{
				1962	struct inode *inode = mapping->host;
				1963	int status = 0;
				1964	struct page *page;
				1965	pgoff_t index;
				1966	unsigned start, end;
				1967	int ownpage = 0;
				1968
				1969	index = pos >> PAGE_CACHE_SHIFT;
				1970	start = pos & (PAGE_CACHE_SIZE - 1);
				1971	end = start + len;
				1972
				1973	page = *pagep;
				1974	if (page == NULL) {
				1975	ownpage = 1;
Nick Piggin	54566b2	2009-01-04 12:00:53 -0800	[diff] [blame]	1976	page = grab_cache_page_write_begin(mapping, index, flags);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1977	if (!page) {
				1978	status = -ENOMEM;
				1979	goto out;
				1980	}
				1981	*pagep = page;
				1982	} else
				1983	BUG_ON(!PageLocked(page));
				1984
				1985	status = __block_prepare_write(inode, page, start, end, get_block);
				1986	if (unlikely(status)) {
				1987	ClearPageUptodate(page);
				1988
				1989	if (ownpage) {
				1990	unlock_page(page);
				1991	page_cache_release(page);
				1992	*pagep = NULL;
				1993
				1994	/*
				1995	* prepare_write() may have instantiated a few blocks
				1996	* outside i_size. Trim these off again. Don't need
				1997	* i_size_read because we hold i_mutex.
				1998	*/
				1999	if (pos + len > inode->i_size)
				2000	vmtruncate(inode, inode->i_size);
				2001	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2002	}
				2003
				2004	out:
				2005	return status;
				2006	}
				2007	EXPORT_SYMBOL(block_write_begin);
				2008
				2009	int block_write_end(struct file file, struct address_space mapping,
				2010	loff_t pos, unsigned len, unsigned copied,
				2011	struct page page, void fsdata)
				2012	{
				2013	struct inode *inode = mapping->host;
				2014	unsigned start;
				2015
				2016	start = pos & (PAGE_CACHE_SIZE - 1);
				2017
				2018	if (unlikely(copied < len)) {
				2019	/*
				2020	* The buffers that were written will now be uptodate, so we
				2021	* don't have to worry about a readpage reading them and
				2022	* overwriting a partial write. However if we have encountered
				2023	* a short write and only partially written into a buffer, it
				2024	* will not be marked uptodate, so a readpage might come in and
				2025	* destroy our partial write.
				2026	*
				2027	* Do the simplest thing, and just treat any short write to a
				2028	* non uptodate page as a zero-length write, and force the
				2029	* caller to redo the whole thing.
				2030	*/
				2031	if (!PageUptodate(page))
				2032	copied = 0;
				2033
				2034	page_zero_new_buffers(page, start+copied, start+len);
				2035	}
				2036	flush_dcache_page(page);
				2037
				2038	/* This could be a short (even 0-length) commit */
				2039	__block_commit_write(inode, page, start, start+copied);
				2040
				2041	return copied;
				2042	}
				2043	EXPORT_SYMBOL(block_write_end);
				2044
				2045	int generic_write_end(struct file file, struct address_space mapping,
				2046	loff_t pos, unsigned len, unsigned copied,
				2047	struct page page, void fsdata)
				2048	{
				2049	struct inode *inode = mapping->host;
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2050	int i_size_changed = 0;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2051
				2052	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
				2053
				2054	/*
				2055	* No need to use i_size_read() here, the i_size
				2056	* cannot change under us because we hold i_mutex.
				2057	*
				2058	* But it's important to update i_size while still holding page lock:
				2059	* page writeout could otherwise come in and zero beyond i_size.
				2060	*/
				2061	if (pos+copied > inode->i_size) {
				2062	i_size_write(inode, pos+copied);
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2063	i_size_changed = 1;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2064	}
				2065
				2066	unlock_page(page);
				2067	page_cache_release(page);
				2068
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2069	/*
				2070	* Don't mark the inode dirty under page lock. First, it unnecessarily
				2071	* makes the holding time of page lock longer. Second, it forces lock
				2072	* ordering of page lock and transaction start for journaling
				2073	* filesystems.
				2074	*/
				2075	if (i_size_changed)
				2076	mark_inode_dirty(inode);
				2077
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2078	return copied;
				2079	}
				2080	EXPORT_SYMBOL(generic_write_end);
				2081
				2082	/*
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2083	* block_is_partially_uptodate checks whether buffers within a page are
				2084	* uptodate or not.
				2085	*
				2086	* Returns true if all buffers which correspond to a file portion
				2087	* we want to read are uptodate.
				2088	*/
				2089	int block_is_partially_uptodate(struct page page, read_descriptor_t desc,
				2090	unsigned long from)
				2091	{
				2092	struct inode *inode = page->mapping->host;
				2093	unsigned block_start, block_end, blocksize;
				2094	unsigned to;
				2095	struct buffer_head bh, head;
				2096	int ret = 1;
				2097
				2098	if (!page_has_buffers(page))
				2099	return 0;
				2100
				2101	blocksize = 1 << inode->i_blkbits;
				2102	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
				2103	to = from + to;
				2104	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
				2105	return 0;
				2106
				2107	head = page_buffers(page);
				2108	bh = head;
				2109	block_start = 0;
				2110	do {
				2111	block_end = block_start + blocksize;
				2112	if (block_end > from && block_start < to) {
				2113	if (!buffer_uptodate(bh)) {
				2114	ret = 0;
				2115	break;
				2116	}
				2117	if (block_end >= to)
				2118	break;
				2119	}
				2120	block_start = block_end;
				2121	bh = bh->b_this_page;
				2122	} while (bh != head);
				2123
				2124	return ret;
				2125	}
				2126	EXPORT_SYMBOL(block_is_partially_uptodate);
				2127
				2128	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2129	* Generic "read page" function for block devices that have the normal
				2130	* get_block functionality. This is most of the block device filesystems.
				2131	* Reads the page asynchronously --- the unlock_buffer() and
				2132	* set/clear_buffer_uptodate() functions propagate buffer state into the
				2133	* page struct once IO has completed.
				2134	*/
				2135	int block_read_full_page(struct page page, get_block_t get_block)
				2136	{
				2137	struct inode *inode = page->mapping->host;
				2138	sector_t iblock, lblock;
				2139	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				2140	unsigned int blocksize;
				2141	int nr, i;
				2142	int fully_mapped = 1;
				2143
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	2144	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2145	blocksize = 1 << inode->i_blkbits;
				2146	if (!page_has_buffers(page))
				2147	create_empty_buffers(page, blocksize, 0);
				2148	head = page_buffers(page);
				2149
				2150	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2151	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				2152	bh = head;
				2153	nr = 0;
				2154	i = 0;
				2155
				2156	do {
				2157	if (buffer_uptodate(bh))
				2158	continue;
				2159
				2160	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2161	int err = 0;
				2162
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2163	fully_mapped = 0;
				2164	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2165	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2166	err = get_block(inode, iblock, bh, 0);
				2167	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2168	SetPageError(page);
				2169	}
				2170	if (!buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2171	zero_user(page, i * blocksize, blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2172	if (!err)
				2173	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2174	continue;
				2175	}
				2176	/*
				2177	* get_block() might have updated the buffer
				2178	* synchronously
				2179	*/
				2180	if (buffer_uptodate(bh))
				2181	continue;
				2182	}
				2183	arr[nr++] = bh;
				2184	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2185
				2186	if (fully_mapped)
				2187	SetPageMappedToDisk(page);
				2188
				2189	if (!nr) {
				2190	/*
				2191	* All buffers are uptodate - we can set the page uptodate
				2192	* as well. But not if get_block() returned an error.
				2193	*/
				2194	if (!PageError(page))
				2195	SetPageUptodate(page);
				2196	unlock_page(page);
				2197	return 0;
				2198	}
				2199
				2200	/* Stage two: lock the buffers */
				2201	for (i = 0; i < nr; i++) {
				2202	bh = arr[i];
				2203	lock_buffer(bh);
				2204	mark_buffer_async_read(bh);
				2205	}
				2206
				2207	/*
				2208	* Stage 3: start the IO. Check for uptodateness
				2209	* inside the buffer lock in case another process reading
				2210	* the underlying blockdev brought it uptodate (the sct fix).
				2211	*/
				2212	for (i = 0; i < nr; i++) {
				2213	bh = arr[i];
				2214	if (buffer_uptodate(bh))
				2215	end_buffer_async_read(bh, 1);
				2216	else
				2217	submit_bh(READ, bh);
				2218	}
				2219	return 0;
				2220	}
				2221
				2222	/* utility function for filesystems that need to do work on expanding
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2223	* truncates. Uses filesystem pagecache writes to allow the filesystem to
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2224	* deal with the hole.
				2225	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2226	int generic_cont_expand_simple(struct inode *inode, loff_t size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2227	{
				2228	struct address_space *mapping = inode->i_mapping;
				2229	struct page *page;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2230	void *fsdata;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2231	unsigned long limit;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2232	int err;
				2233
				2234	err = -EFBIG;
				2235	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2236	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2237	send_sig(SIGXFSZ, current, 0);
				2238	goto out;
				2239	}
				2240	if (size > inode->i_sb->s_maxbytes)
				2241	goto out;
				2242
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2243	err = pagecache_write_begin(NULL, mapping, size, 0,
				2244	AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
				2245	&page, &fsdata);
				2246	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2247	goto out;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2248
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2249	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
				2250	BUG_ON(err > 0);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2251
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2252	out:
				2253	return err;
				2254	}
				2255
Adrian Bunk	f1e3af7	2008-04-29 00:59:01 -0700	[diff] [blame]	2256	static int cont_expand_zero(struct file *file, struct address_space *mapping,
				2257	loff_t pos, loff_t *bytes)
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2258	{
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2259	struct inode *inode = mapping->host;
				2260	unsigned blocksize = 1 << inode->i_blkbits;
				2261	struct page *page;
				2262	void *fsdata;
				2263	pgoff_t index, curidx;
				2264	loff_t curpos;
				2265	unsigned zerofrom, offset, len;
				2266	int err = 0;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2267
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2268	index = pos >> PAGE_CACHE_SHIFT;
				2269	offset = pos & ~PAGE_CACHE_MASK;
				2270
				2271	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
				2272	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2273	if (zerofrom & (blocksize-1)) {
				2274	*bytes |= (blocksize-1);
				2275	(*bytes)++;
				2276	}
				2277	len = PAGE_CACHE_SIZE - zerofrom;
				2278
				2279	err = pagecache_write_begin(file, mapping, curpos, len,
				2280	AOP_FLAG_UNINTERRUPTIBLE,
				2281	&page, &fsdata);
				2282	if (err)
				2283	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2284	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2285	err = pagecache_write_end(file, mapping, curpos, len, len,
				2286	page, fsdata);
				2287	if (err < 0)
				2288	goto out;
				2289	BUG_ON(err != len);
				2290	err = 0;
OGAWA Hirofumi	061e974	2008-04-28 02:16:28 -0700	[diff] [blame]	2291
				2292	balance_dirty_pages_ratelimited(mapping);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2293	}
				2294
				2295	/* page covers the boundary, find the boundary offset */
				2296	if (index == curidx) {
				2297	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2298	/* if we will expand the thing last block will be filled */
				2299	if (offset <= zerofrom) {
				2300	goto out;
				2301	}
				2302	if (zerofrom & (blocksize-1)) {
				2303	*bytes |= (blocksize-1);
				2304	(*bytes)++;
				2305	}
				2306	len = offset - zerofrom;
				2307
				2308	err = pagecache_write_begin(file, mapping, curpos, len,
				2309	AOP_FLAG_UNINTERRUPTIBLE,
				2310	&page, &fsdata);
				2311	if (err)
				2312	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2313	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2314	err = pagecache_write_end(file, mapping, curpos, len, len,
				2315	page, fsdata);
				2316	if (err < 0)
				2317	goto out;
				2318	BUG_ON(err != len);
				2319	err = 0;
				2320	}
				2321	out:
				2322	return err;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2323	}
				2324
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2325	/*
				2326	* For moronic filesystems that do not allow holes in file.
				2327	* We may have to extend the file.
				2328	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2329	int cont_write_begin(struct file *file, struct address_space *mapping,
				2330	loff_t pos, unsigned len, unsigned flags,
				2331	struct page **pagep, void **fsdata,
				2332	get_block_t *get_block, loff_t *bytes)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2333	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2334	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2335	unsigned blocksize = 1 << inode->i_blkbits;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2336	unsigned zerofrom;
				2337	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2338
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2339	err = cont_expand_zero(file, mapping, pos, bytes);
				2340	if (err)
				2341	goto out;
				2342
				2343	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2344	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
				2345	*bytes |= (blocksize-1);
				2346	(*bytes)++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2347	}
				2348
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2349	*pagep = NULL;
				2350	err = block_write_begin(file, mapping, pos, len,
				2351	flags, pagep, fsdata, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2352	out:
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2353	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2354	}
				2355
				2356	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2357	get_block_t *get_block)
				2358	{
				2359	struct inode *inode = page->mapping->host;
				2360	int err = __block_prepare_write(inode, page, from, to, get_block);
				2361	if (err)
				2362	ClearPageUptodate(page);
				2363	return err;
				2364	}
				2365
				2366	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2367	{
				2368	struct inode *inode = page->mapping->host;
				2369	__block_commit_write(inode,page,from,to);
				2370	return 0;
				2371	}
				2372
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2373	/*
				2374	* block_page_mkwrite() is not allowed to change the file size as it gets
				2375	* called from a page fault handler when a page is first dirtied. Hence we must
				2376	* be careful to check for EOF conditions here. We set the page up correctly
				2377	* for a written page which means we get ENOSPC checking when writing into
				2378	* holes and correct delalloc and unwritten extent mapping on filesystems that
				2379	* support these features.
				2380	*
				2381	* We are not allowed to take the i_mutex here so we have to play games to
				2382	* protect against truncate races as the page could now be beyond EOF. Because
				2383	* vmtruncate() writes the inode size before removing pages, once we have the
				2384	* page lock we can determine safely if the page is beyond EOF. If it is not
				2385	* beyond EOF, then the page is guaranteed safe against truncation until we
				2386	* unlock the page.
				2387	*/
				2388	int
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2389	block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2390	get_block_t get_block)
				2391	{
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2392	struct page *page = vmf->page;
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2393	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
				2394	unsigned long end;
				2395	loff_t size;
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2396	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2397
				2398	lock_page(page);
				2399	size = i_size_read(inode);
				2400	if ((page->mapping != inode->i_mapping) ||
Nick Piggin	1833633	2007-07-20 00:31:45 -0700	[diff] [blame]	2401	(page_offset(page) > size)) {
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2402	/* page got truncated out from underneath us */
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2403	unlock_page(page);
				2404	goto out;
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2405	}
				2406
				2407	/* page is wholly or partially inside EOF */
				2408	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
				2409	end = size & ~PAGE_CACHE_MASK;
				2410	else
				2411	end = PAGE_CACHE_SIZE;
				2412
				2413	ret = block_prepare_write(page, 0, end, get_block);
				2414	if (!ret)
				2415	ret = block_commit_write(page, 0, end);
				2416
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2417	if (unlikely(ret)) {
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2418	unlock_page(page);
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2419	if (ret == -ENOMEM)
				2420	ret = VM_FAULT_OOM;
				2421	else /* -ENOSPC, -EIO, etc */
				2422	ret = VM_FAULT_SIGBUS;
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2423	} else
				2424	ret = VM_FAULT_LOCKED;
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2425
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2426	out:
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2427	return ret;
				2428	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2429
				2430	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2431	* nobh_write_begin()'s prereads are special: the buffer_heads are freed
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2432	* immediately, while under the page lock. So it needs a special end_io
				2433	* handler which does not touch the bh after unlocking it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2434	*/
				2435	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2436	{
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	2437	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2438	}
				2439
				2440	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2441	* Attach the singly-linked list of buffers created by nobh_write_begin, to
				2442	* the page (converting it to circular linked list and taking care of page
				2443	* dirty races).
				2444	*/
				2445	static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
				2446	{
				2447	struct buffer_head *bh;
				2448
				2449	BUG_ON(!PageLocked(page));
				2450
				2451	spin_lock(&page->mapping->private_lock);
				2452	bh = head;
				2453	do {
				2454	if (PageDirty(page))
				2455	set_buffer_dirty(bh);
				2456	if (!bh->b_this_page)
				2457	bh->b_this_page = head;
				2458	bh = bh->b_this_page;
				2459	} while (bh != head);
				2460	attach_page_buffers(page, head);
				2461	spin_unlock(&page->mapping->private_lock);
				2462	}
				2463
				2464	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2465	* On entry, the page is fully not uptodate.
				2466	* On exit the page is fully uptodate in the areas outside (from,to)
				2467	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2468	int nobh_write_begin(struct file *file, struct address_space *mapping,
				2469	loff_t pos, unsigned len, unsigned flags,
				2470	struct page **pagep, void **fsdata,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2471	get_block_t *get_block)
				2472	{
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2473	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2474	const unsigned blkbits = inode->i_blkbits;
				2475	const unsigned blocksize = 1 << blkbits;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2476	struct buffer_head *head, *bh;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2477	struct page *page;
				2478	pgoff_t index;
				2479	unsigned from, to;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2480	unsigned block_in_page;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2481	unsigned block_start, block_end;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2482	sector_t block_in_file;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2483	int nr_reads = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2484	int ret = 0;
				2485	int is_mapped_to_disk = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2486
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2487	index = pos >> PAGE_CACHE_SHIFT;
				2488	from = pos & (PAGE_CACHE_SIZE - 1);
				2489	to = from + len;
				2490
Nick Piggin	54566b2	2009-01-04 12:00:53 -0800	[diff] [blame]	2491	page = grab_cache_page_write_begin(mapping, index, flags);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2492	if (!page)
				2493	return -ENOMEM;
				2494	*pagep = page;
				2495	*fsdata = NULL;
				2496
				2497	if (page_has_buffers(page)) {
				2498	unlock_page(page);
				2499	page_cache_release(page);
				2500	*pagep = NULL;
				2501	return block_write_begin(file, mapping, pos, len, flags, pagep,
				2502	fsdata, get_block);
				2503	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2504
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2505	if (PageMappedToDisk(page))
				2506	return 0;
				2507
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2508	/*
				2509	* Allocate buffers so that we can keep track of state, and potentially
				2510	* attach them to the page if an error occurs. In the common case of
				2511	* no error, they will just be freed again without ever being attached
				2512	* to the page (which is all OK, because we're under the page lock).
				2513	*
				2514	* Be careful: the buffer linked list is a NULL terminated one, rather
				2515	* than the circular one we're used to.
				2516	*/
				2517	head = alloc_page_buffers(page, blocksize, 0);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2518	if (!head) {
				2519	ret = -ENOMEM;
				2520	goto out_release;
				2521	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2522
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2523	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2524
				2525	/*
				2526	* We loop across all blocks in the page, whether or not they are
				2527	* part of the affected region. This is so we can discover if the
				2528	* page is fully mapped-to-disk.
				2529	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2530	for (block_start = 0, block_in_page = 0, bh = head;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2531	block_start < PAGE_CACHE_SIZE;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2532	block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2533	int create;
				2534
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2535	block_end = block_start + blocksize;
				2536	bh->b_state = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2537	create = 1;
				2538	if (block_start >= to)
				2539	create = 0;
				2540	ret = get_block(inode, block_in_file + block_in_page,
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2541	bh, create);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2542	if (ret)
				2543	goto failed;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2544	if (!buffer_mapped(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2545	is_mapped_to_disk = 0;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2546	if (buffer_new(bh))
				2547	unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
				2548	if (PageUptodate(page)) {
				2549	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2550	continue;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2551	}
				2552	if (buffer_new(bh) || !buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2553	zero_user_segments(page, block_start, from,
				2554	to, block_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2555	continue;
				2556	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2557	if (buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2558	continue; /* reiserfs does this */
				2559	if (block_start < from || block_end > to) {
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2560	lock_buffer(bh);
				2561	bh->b_end_io = end_buffer_read_nobh;
				2562	submit_bh(READ, bh);
				2563	nr_reads++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2564	}
				2565	}
				2566
				2567	if (nr_reads) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2568	/*
				2569	* The page is locked, so these buffers are protected from
				2570	* any VM or truncate activity. Hence we don't need to care
				2571	* for the buffer_head refcounts.
				2572	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2573	for (bh = head; bh; bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2574	wait_on_buffer(bh);
				2575	if (!buffer_uptodate(bh))
				2576	ret = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2577	}
				2578	if (ret)
				2579	goto failed;
				2580	}
				2581
				2582	if (is_mapped_to_disk)
				2583	SetPageMappedToDisk(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2584
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2585	*fsdata = head; /* to be released by nobh_write_end */
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2586
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2587	return 0;
				2588
				2589	failed:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2590	BUG_ON(!ret);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2591	/*
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2592	* Error recovery is a bit difficult. We need to zero out blocks that
				2593	* were newly allocated, and dirty them to ensure they get written out.
				2594	* Buffers need to be attached to the page at this point, otherwise
				2595	* the handling of potential IO errors during writeout would be hard
				2596	* (could try doing synchronous writeout, but what if that fails too?)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2597	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2598	attach_nobh_buffers(page, head);
				2599	page_zero_new_buffers(page, from, to);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2600
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2601	out_release:
				2602	unlock_page(page);
				2603	page_cache_release(page);
				2604	*pagep = NULL;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2605
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2606	if (pos + len > inode->i_size)
				2607	vmtruncate(inode, inode->i_size);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2608
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2609	return ret;
				2610	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2611	EXPORT_SYMBOL(nobh_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2612
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2613	int nobh_write_end(struct file *file, struct address_space *mapping,
				2614	loff_t pos, unsigned len, unsigned copied,
				2615	struct page *page, void *fsdata)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2616	{
				2617	struct inode *inode = page->mapping->host;
Nick Piggin	efdc313	2007-10-21 06:57:41 +0200	[diff] [blame]	2618	struct buffer_head *head = fsdata;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2619	struct buffer_head *bh;
Dmitri Monakhov	5b41e74	2008-03-28 14:15:52 -0700	[diff] [blame]	2620	BUG_ON(fsdata != NULL && page_has_buffers(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2621
Dave Kleikamp	d4cf109	2009-02-06 14:59:26 -0600	[diff] [blame]	2622	if (unlikely(copied < len) && head)
Dmitri Monakhov	5b41e74	2008-03-28 14:15:52 -0700	[diff] [blame]	2623	attach_nobh_buffers(page, head);
				2624	if (page_has_buffers(page))
				2625	return generic_write_end(file, mapping, pos, len,
				2626	copied, page, fsdata);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2627
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2628	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2629	set_page_dirty(page);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2630	if (pos+copied > inode->i_size) {
				2631	i_size_write(inode, pos+copied);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2632	mark_inode_dirty(inode);
				2633	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2634
				2635	unlock_page(page);
				2636	page_cache_release(page);
				2637
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2638	while (head) {
				2639	bh = head;
				2640	head = head->b_this_page;
				2641	free_buffer_head(bh);
				2642	}
				2643
				2644	return copied;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2645	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2646	EXPORT_SYMBOL(nobh_write_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2647
				2648	/*
				2649	* nobh_writepage() - based on block_full_write_page() except
				2650	* that it tries to operate without attaching bufferheads to
				2651	* the page.
				2652	*/
				2653	int nobh_writepage(struct page *page, get_block_t *get_block,
				2654	struct writeback_control *wbc)
				2655	{
				2656	struct inode * const inode = page->mapping->host;
				2657	loff_t i_size = i_size_read(inode);
				2658	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2659	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2660	int ret;
				2661
				2662	/* Is the page fully inside i_size? */
				2663	if (page->index < end_index)
				2664	goto out;
				2665
				2666	/* Is the page fully outside i_size? (truncate in progress) */
				2667	offset = i_size & (PAGE_CACHE_SIZE-1);
				2668	if (page->index >= end_index+1 || !offset) {
				2669	/*
				2670	* The page may have dirty, unmapped buffers. For example,
				2671	* they may have been added in ext3_writepage(). Make them
				2672	* freeable here, so the page does not leak.
				2673	*/
				2674	#if 0
				2675	/* Not really sure about this - do we need this ? */
				2676	if (page->mapping->a_ops->invalidatepage)
				2677	page->mapping->a_ops->invalidatepage(page, offset);
				2678	#endif
				2679	unlock_page(page);
				2680	return 0; /* don't care */
				2681	}
				2682
				2683	/*
				2684	* The page straddles i_size. It must be zeroed out on each and every
				2685	* writepage invocation because it may be mmapped. "A file is mapped
				2686	* in multiples of the page size. For a file that is not a multiple of
				2687	* the page size, the remaining memory is zeroed when mapped, and
				2688	* writes to that region are not written out to the file."
				2689	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2690	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2691	out:
				2692	ret = mpage_writepage(page, get_block, wbc);
				2693	if (ret == -EAGAIN)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2694	ret = __block_write_full_page(inode, page, get_block, wbc,
				2695	end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2696	return ret;
				2697	}
				2698	EXPORT_SYMBOL(nobh_writepage);
				2699
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2700	int nobh_truncate_page(struct address_space *mapping,
				2701	loff_t from, get_block_t *get_block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2702	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2703	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2704	unsigned offset = from & (PAGE_CACHE_SIZE-1);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2705	unsigned blocksize;
				2706	sector_t iblock;
				2707	unsigned length, pos;
				2708	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2709	struct page *page;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2710	struct buffer_head map_bh;
				2711	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2712
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2713	blocksize = 1 << inode->i_blkbits;
				2714	length = offset & (blocksize - 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2715
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2716	/* Block boundary? Nothing to do */
				2717	if (!length)
				2718	return 0;
				2719
				2720	length = blocksize - length;
				2721	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2722
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2723	page = grab_cache_page(mapping, index);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2724	err = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2725	if (!page)
				2726	goto out;
				2727
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2728	if (page_has_buffers(page)) {
				2729	has_buffers:
				2730	unlock_page(page);
				2731	page_cache_release(page);
				2732	return block_truncate_page(mapping, from, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2733	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2734
				2735	/* Find the buffer that contains "offset" */
				2736	pos = blocksize;
				2737	while (offset >= pos) {
				2738	iblock++;
				2739	pos += blocksize;
				2740	}
				2741
Theodore Ts'o	460bcf5	2009-05-12 07:37:56 -0400	[diff] [blame]	2742	map_bh.b_size = blocksize;
				2743	map_bh.b_state = 0;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2744	err = get_block(inode, iblock, &map_bh, 0);
				2745	if (err)
				2746	goto unlock;
				2747	/* unmapped? It's a hole - nothing to do */
				2748	if (!buffer_mapped(&map_bh))
				2749	goto unlock;
				2750
				2751	/* Ok, it's mapped. Make sure it's up-to-date */
				2752	if (!PageUptodate(page)) {
				2753	err = mapping->a_ops->readpage(NULL, page);
				2754	if (err) {
				2755	page_cache_release(page);
				2756	goto out;
				2757	}
				2758	lock_page(page);
				2759	if (!PageUptodate(page)) {
				2760	err = -EIO;
				2761	goto unlock;
				2762	}
				2763	if (page_has_buffers(page))
				2764	goto has_buffers;
				2765	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2766	zero_user(page, offset, length);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2767	set_page_dirty(page);
				2768	err = 0;
				2769
				2770	unlock:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2771	unlock_page(page);
				2772	page_cache_release(page);
				2773	out:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2774	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2775	}
				2776	EXPORT_SYMBOL(nobh_truncate_page);
				2777
				2778	int block_truncate_page(struct address_space *mapping,
				2779	loff_t from, get_block_t *get_block)
				2780	{
				2781	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2782	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2783	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2784	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2785	unsigned length, pos;
				2786	struct inode *inode = mapping->host;
				2787	struct page *page;
				2788	struct buffer_head *bh;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2789	int err;
				2790
				2791	blocksize = 1 << inode->i_blkbits;
				2792	length = offset & (blocksize - 1);
				2793
				2794	/* Block boundary? Nothing to do */
				2795	if (!length)
				2796	return 0;
				2797
				2798	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2799	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2800
				2801	page = grab_cache_page(mapping, index);
				2802	err = -ENOMEM;
				2803	if (!page)
				2804	goto out;
				2805
				2806	if (!page_has_buffers(page))
				2807	create_empty_buffers(page, blocksize, 0);
				2808
				2809	/* Find the buffer that contains "offset" */
				2810	bh = page_buffers(page);
				2811	pos = blocksize;
				2812	while (offset >= pos) {
				2813	bh = bh->b_this_page;
				2814	iblock++;
				2815	pos += blocksize;
				2816	}
				2817
				2818	err = 0;
				2819	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2820	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2821	err = get_block(inode, iblock, bh, 0);
				2822	if (err)
				2823	goto unlock;
				2824	/* unmapped? It's a hole - nothing to do */
				2825	if (!buffer_mapped(bh))
				2826	goto unlock;
				2827	}
				2828
				2829	/* Ok, it's mapped. Make sure it's up-to-date */
				2830	if (PageUptodate(page))
				2831	set_buffer_uptodate(bh);
				2832
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2833	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2834	err = -EIO;
				2835	ll_rw_block(READ, 1, &bh);
				2836	wait_on_buffer(bh);
				2837	/* Uhhuh. Read error. Complain and punt. */
				2838	if (!buffer_uptodate(bh))
				2839	goto unlock;
				2840	}
				2841
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2842	zero_user(page, offset, length);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2843	mark_buffer_dirty(bh);
				2844	err = 0;
				2845
				2846	unlock:
				2847	unlock_page(page);
				2848	page_cache_release(page);
				2849	out:
				2850	return err;
				2851	}
				2852
				2853	/*
				2854	* The generic ->writepage function for buffer-backed address_spaces
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2855	* this form passes in the end_io handler used to finish the IO.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2856	*/
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2857	int block_write_full_page_endio(struct page *page, get_block_t *get_block,
				2858	struct writeback_control *wbc, bh_end_io_t *handler)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2859	{
				2860	struct inode * const inode = page->mapping->host;
				2861	loff_t i_size = i_size_read(inode);
				2862	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2863	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2864
				2865	/* Is the page fully inside i_size? */
				2866	if (page->index < end_index)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2867	return __block_write_full_page(inode, page, get_block, wbc,
				2868	handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2869
				2870	/* Is the page fully outside i_size? (truncate in progress) */
				2871	offset = i_size & (PAGE_CACHE_SIZE-1);
				2872	if (page->index >= end_index+1 || !offset) {
				2873	/*
				2874	* The page may have dirty, unmapped buffers. For example,
				2875	* they may have been added in ext3_writepage(). Make them
				2876	* freeable here, so the page does not leak.
				2877	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2878	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2879	unlock_page(page);
				2880	return 0; /* don't care */
				2881	}
				2882
				2883	/*
				2884	* The page straddles i_size. It must be zeroed out on each and every
				2885	* writepage invokation because it may be mmapped. "A file is mapped
				2886	* in multiples of the page size. For a file that is not a multiple of
				2887	* the page size, the remaining memory is zeroed when mapped, and
				2888	* writes to that region are not written out to the file."
				2889	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2890	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2891	return __block_write_full_page(inode, page, get_block, wbc, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2892	}
				2893
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2894	/*
				2895	* The generic ->writepage function for buffer-backed address_spaces
				2896	*/
				2897	int block_write_full_page(struct page page, get_block_t get_block,
				2898	struct writeback_control *wbc)
				2899	{
				2900	return block_write_full_page_endio(page, get_block, wbc,
				2901	end_buffer_async_write);
				2902	}
				2903
				2904
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2905	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2906	get_block_t *get_block)
				2907	{
				2908	struct buffer_head tmp;
				2909	struct inode *inode = mapping->host;
				2910	tmp.b_state = 0;
				2911	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2912	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2913	get_block(inode, block, &tmp, 0);
				2914	return tmp.b_blocknr;
				2915	}
				2916
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	2917	static void end_bio_bh_io_sync(struct bio *bio, int err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2918	{
				2919	struct buffer_head *bh = bio->bi_private;
				2920
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2921	if (err == -EOPNOTSUPP) {
				2922	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2923	set_bit(BH_Eopnotsupp, &bh->b_state);
				2924	}
				2925
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	2926	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
				2927	set_bit(BH_Quiet, &bh->b_state);
				2928
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2929	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2930	bio_put(bio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2931	}
				2932
				2933	int submit_bh(int rw, struct buffer_head * bh)
				2934	{
				2935	struct bio *bio;
				2936	int ret = 0;
				2937
				2938	BUG_ON(!buffer_locked(bh));
				2939	BUG_ON(!buffer_mapped(bh));
				2940	BUG_ON(!bh->b_end_io);
Aneesh Kumar K.V	8fb0e34	2009-05-12 16:22:37 -0400	[diff] [blame]	2941	BUG_ON(buffer_delay(bh));
				2942	BUG_ON(buffer_unwritten(bh));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2943
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2944	/*
				2945	* Mask in barrier bit for a write (could be either a WRITE or a
				2946	* WRITE_SYNC
				2947	*/
				2948	if (buffer_ordered(bh) && (rw & WRITE))
				2949	rw \|= WRITE_BARRIER;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2950
				2951	/*
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2952	* Only clear out a write error when rewriting
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2953	*/
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2954	if (test_set_buffer_req(bh) && (rw & WRITE))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2955	clear_buffer_write_io_error(bh);
				2956
				2957	/*
				2958	* from here on down, it's all bio -- do the initial mapping,
				2959	* submit_bio -> generic_make_request may further map this bio around
				2960	*/
				2961	bio = bio_alloc(GFP_NOIO, 1);
				2962
				2963	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2964	bio->bi_bdev = bh->b_bdev;
				2965	bio->bi_io_vec[0].bv_page = bh->b_page;
				2966	bio->bi_io_vec[0].bv_len = bh->b_size;
				2967	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2968
				2969	bio->bi_vcnt = 1;
				2970	bio->bi_idx = 0;
				2971	bio->bi_size = bh->b_size;
				2972
				2973	bio->bi_end_io = end_bio_bh_io_sync;
				2974	bio->bi_private = bh;
				2975
				2976	bio_get(bio);
				2977	submit_bio(rw, bio);
				2978
				2979	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2980	ret = -EOPNOTSUPP;
				2981
				2982	bio_put(bio);
				2983	return ret;
				2984	}
				2985
				2986	/**
				2987	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2988	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2989	* @nr: number of &struct buffer_heads in the array
				2990	* @bhs: array of pointers to &struct buffer_head
				2991	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2992	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2993	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2994	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2995	* are sent to disk. The fourth %READA option is described in the documentation
				2996	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2997	*
				2998	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2999	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				3000	* clean when doing a write request, and any buffer that appears to be
				3001	* up-to-date when doing read request. Further it marks as clean buffers that
				3002	* are processed for writing (the buffer cache won't assume that they are
				3003	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3004	*
				3005	* ll_rw_block sets b_end_io to simple completion handler that marks
				3006	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				3007	* any waiters.
				3008	*
				3009	* All of the buffers must be for the same device, and must also be a
				3010	* multiple of the current approved size for the device.
				3011	*/
				3012	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				3013	{
				3014	int i;
				3015
				3016	for (i = 0; i < nr; i++) {
				3017	struct buffer_head *bh = bhs[i];
				3018
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	3019	if (rw == SWRITE \|\| rw == SWRITE_SYNC \|\| rw == SWRITE_SYNC_PLUG)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3020	lock_buffer(bh);
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	3021	else if (!trylock_buffer(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3022	continue;
				3023
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	3024	if (rw == WRITE \|\| rw == SWRITE \|\| rw == SWRITE_SYNC \|\|
				3025	rw == SWRITE_SYNC_PLUG) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3026	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	3027	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	3028	get_bh(bh);
Jens Axboe	18ce375	2008-07-01 09:07:34 +0200	[diff] [blame]	3029	if (rw == SWRITE_SYNC)
				3030	submit_bh(WRITE_SYNC, bh);
				3031	else
				3032	submit_bh(WRITE, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3033	continue;
				3034	}
				3035	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3036	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	3037	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	3038	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3039	submit_bh(rw, bh);
				3040	continue;
				3041	}
				3042	}
				3043	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3044	}
				3045	}
				3046
				3047	/*
				3048	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				3049	* and then start new I/O and then wait upon it. The caller must have a ref on
				3050	* the buffer_head.
				3051	*/
				3052	int sync_dirty_buffer(struct buffer_head *bh)
				3053	{
				3054	int ret = 0;
				3055
				3056	WARN_ON(atomic_read(&bh->b_count) < 1);
				3057	lock_buffer(bh);
				3058	if (test_clear_buffer_dirty(bh)) {
				3059	get_bh(bh);
				3060	bh->b_end_io = end_buffer_write_sync;
Jens Axboe	1aa2a7c	2009-04-06 14:48:08 +0200	[diff] [blame]	3061	ret = submit_bh(WRITE_SYNC, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3062	wait_on_buffer(bh);
				3063	if (buffer_eopnotsupp(bh)) {
				3064	clear_buffer_eopnotsupp(bh);
				3065	ret = -EOPNOTSUPP;
				3066	}
				3067	if (!ret && !buffer_uptodate(bh))
				3068	ret = -EIO;
				3069	} else {
				3070	unlock_buffer(bh);
				3071	}
				3072	return ret;
				3073	}
				3074
				3075	/*
				3076	* try_to_free_buffers() checks if all the buffers on this particular page
				3077	* are unused, and releases them if so.
				3078	*
				3079	* Exclusion against try_to_free_buffers may be obtained by either
				3080	* locking the page or by holding its mapping's private_lock.
				3081	*
				3082	* If the page is dirty but all the buffers are clean then we need to
				3083	* be sure to mark the page clean as well. This is because the page
				3084	* may be against a block device, and a later reattachment of buffers
				3085	* to a dirty page will set all buffers dirty. Which would corrupt
				3086	* filesystem data on the same device.
				3087	*
				3088	* The same applies to regular filesystem pages: if all the buffers are
				3089	* clean then we set the page clean and proceed. To do that, we require
				3090	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				3091	* private_lock.
				3092	*
				3093	* try_to_free_buffers() is non-blocking.
				3094	*/
				3095	static inline int buffer_busy(struct buffer_head *bh)
				3096	{
				3097	return atomic_read(&bh->b_count) \|
				3098	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				3099	}
				3100
				3101	static int
				3102	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				3103	{
				3104	struct buffer_head *head = page_buffers(page);
				3105	struct buffer_head *bh;
				3106
				3107	bh = head;
				3108	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	3109	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3110	set_bit(AS_EIO, &page->mapping->flags);
				3111	if (buffer_busy(bh))
				3112	goto failed;
				3113	bh = bh->b_this_page;
				3114	} while (bh != head);
				3115
				3116	do {
				3117	struct buffer_head *next = bh->b_this_page;
				3118
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	3119	if (bh->b_assoc_map)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3120	__remove_assoc_queue(bh);
				3121	bh = next;
				3122	} while (bh != head);
				3123	*buffers_to_free = head;
				3124	__clear_page_buffers(page);
				3125	return 1;
				3126	failed:
				3127	return 0;
				3128	}
				3129
				3130	int try_to_free_buffers(struct page *page)
				3131	{
				3132	struct address_space * const mapping = page->mapping;
				3133	struct buffer_head *buffers_to_free = NULL;
				3134	int ret = 0;
				3135
				3136	BUG_ON(!PageLocked(page));
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3137	if (PageWriteback(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3138	return 0;
				3139
				3140	if (mapping == NULL) { /* can this still happen? */
				3141	ret = drop_buffers(page, &buffers_to_free);
				3142	goto out;
				3143	}
				3144
				3145	spin_lock(&mapping->private_lock);
				3146	ret = drop_buffers(page, &buffers_to_free);
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3147
				3148	/*
				3149	* If the filesystem writes its buffers by hand (eg ext3)
				3150	* then we can have clean buffers against a dirty page. We
				3151	* clean the page here; otherwise the VM will never notice
				3152	* that the filesystem did any IO at all.
				3153	*
				3154	* Also, during truncate, discard_buffer will have marked all
				3155	* the page's buffers clean. We discover that here and clean
				3156	* the page also.
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3157	*
				3158	* private_lock must be held over this entire operation in order
				3159	* to synchronise against __set_page_dirty_buffers and prevent the
				3160	* dirty bit from being lost.
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3161	*/
				3162	if (ret)
				3163	cancel_dirty_page(page, PAGE_CACHE_SIZE);
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3164	spin_unlock(&mapping->private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3165	out:
				3166	if (buffers_to_free) {
				3167	struct buffer_head *bh = buffers_to_free;
				3168
				3169	do {
				3170	struct buffer_head *next = bh->b_this_page;
				3171	free_buffer_head(bh);
				3172	bh = next;
				3173	} while (bh != buffers_to_free);
				3174	}
				3175	return ret;
				3176	}
				3177	EXPORT_SYMBOL(try_to_free_buffers);
				3178
NeilBrown	3978d71	2006-03-26 01:37:17 -0800	[diff] [blame]	3179	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3180	{
				3181	struct address_space *mapping;
				3182
				3183	smp_mb();
				3184	mapping = page_mapping(page);
				3185	if (mapping)
				3186	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3187	}
				3188
				3189	/*
				3190	* There are no bdflush tunables left. But distributions are
				3191	* still running obsolete flush daemons, so we terminate them here.
				3192	*
				3193	* Use of bdflush() is deprecated and will be removed in a future kernel.
				3194	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				3195	*/
Heiko Carstens	bdc480e	2009-01-14 14:14:12 +0100	[diff] [blame]	3196	SYSCALL_DEFINE2(bdflush, int, func, long, data)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3197	{
				3198	static int msg_count;
				3199
				3200	if (!capable(CAP_SYS_ADMIN))
				3201	return -EPERM;
				3202
				3203	if (msg_count < 5) {
				3204	msg_count++;
				3205	printk(KERN_INFO
				3206	"warning: process `%s' used the obsolete bdflush"
				3207	" system call\n", current->comm);
				3208	printk(KERN_INFO "Fix your initscripts?\n");
				3209	}
				3210
				3211	if (func == 1)
				3212	do_exit(0);
				3213	return 0;
				3214	}
				3215
				3216	/*
				3217	* Buffer-head allocation
				3218	*/
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	3219	static struct kmem_cache *bh_cachep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3220
				3221	/*
				3222	* Once the number of bh's in the machine exceeds this level, we start
				3223	* stripping them in writeback.
				3224	*/
				3225	static int max_buffer_heads;
				3226
				3227	int buffer_heads_over_limit;
				3228
				3229	struct bh_accounting {
				3230	int nr; /* Number of live bh's */
				3231	int ratelimit; /* Limit cacheline bouncing */
				3232	};
				3233
				3234	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3235
				3236	static void recalc_bh_state(void)
				3237	{
				3238	int i;
				3239	int tot = 0;
				3240
				3241	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				3242	return;
				3243	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3244	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3245	tot += per_cpu(bh_accounting, i).nr;
				3246	buffer_heads_over_limit = (tot > max_buffer_heads);
				3247	}
				3248
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	3249	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3250	{
Christoph Lameter	488514d	2008-04-28 02:12:05 -0700	[diff] [blame]	3251	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3252	if (ret) {
Christoph Lameter	a35afb8	2007-05-16 22:10:57 -0700	[diff] [blame]	3253	INIT_LIST_HEAD(&ret->b_assoc_buffers);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3254	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3255	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3256	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3257	}
				3258	return ret;
				3259	}
				3260	EXPORT_SYMBOL(alloc_buffer_head);
				3261
				3262	void free_buffer_head(struct buffer_head *bh)
				3263	{
				3264	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3265	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3266	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3267	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3268	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3269	}
				3270	EXPORT_SYMBOL(free_buffer_head);
				3271
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3272	static void buffer_exit_cpu(int cpu)
				3273	{
				3274	int i;
				3275	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3276
				3277	for (i = 0; i < BH_LRU_SIZE; i++) {
				3278	brelse(b->bhs[i]);
				3279	b->bhs[i] = NULL;
				3280	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3281	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				3282	per_cpu(bh_accounting, cpu).nr = 0;
				3283	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3284	}
				3285
				3286	static int buffer_cpu_notify(struct notifier_block *self,
				3287	unsigned long action, void *hcpu)
				3288	{
Rafael J. Wysocki	8bb7844	2007-05-09 02:35:10 -0700	[diff] [blame]	3289	if (action == CPU_DEAD \|\| action == CPU_DEAD_FROZEN)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3290	buffer_exit_cpu((unsigned long)hcpu);
				3291	return NOTIFY_OK;
				3292	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3293
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 *
 * The up-to-date state is checked once without the lock and, if that
 * fails, once more after taking it; the lock is only kept when the
 * second check fails as well.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	if (!buffer_uptodate(bh))
		return 0;	/* caller now owns the buffer lock */

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
				3312
				3313	/**
Randy Dunlap	a6b9191	2008-03-19 17:01:00 -0700	[diff] [blame]	3314	* bh_submit_read - Submit a locked buffer for reading
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3315	* @bh: struct buffer_head
				3316	*
				3317	* Returns zero on success and -EIO on error.
				3318	*/
				3319	int bh_submit_read(struct buffer_head *bh)
				3320	{
				3321	BUG_ON(!buffer_locked(bh));
				3322
				3323	if (buffer_uptodate(bh)) {
				3324	unlock_buffer(bh);
				3325	return 0;
				3326	}
				3327
				3328	get_bh(bh);
				3329	bh->b_end_io = end_buffer_read_sync;
				3330	submit_bh(READ, bh);
				3331	wait_on_buffer(bh);
				3332	if (buffer_uptodate(bh))
				3333	return 0;
				3334	return -EIO;
				3335	}
				3336	EXPORT_SYMBOL(bh_submit_read);
				3337
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3338	static void
Alexey Dobriyan	51cc506	2008-07-25 19:45:34 -0700	[diff] [blame]	3339	init_buffer_head(void *data)
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3340	{
				3341	struct buffer_head *bh = data;
				3342
				3343	memset(bh, 0, sizeof(*bh));
				3344	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				3345	}
				3346
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3347	void __init buffer_init(void)
				3348	{
				3349	int nrpages;
				3350
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3351	bh_cachep = kmem_cache_create("buffer_head",
				3352	sizeof(struct buffer_head), 0,
				3353	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3354	SLAB_MEM_SPREAD),
				3355	init_buffer_head);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3356
				3357	/*
				3358	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3359	*/
				3360	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3361	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3362	hotcpu_notifier(buffer_cpu_notify, 0);
				3363	}
				3364
				3365	EXPORT_SYMBOL(__bforget);
				3366	EXPORT_SYMBOL(__brelse);
				3367	EXPORT_SYMBOL(__wait_on_buffer);
				3368	EXPORT_SYMBOL(block_commit_write);
				3369	EXPORT_SYMBOL(block_prepare_write);
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	3370	EXPORT_SYMBOL(block_page_mkwrite);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3371	EXPORT_SYMBOL(block_read_full_page);
				3372	EXPORT_SYMBOL(block_sync_page);
				3373	EXPORT_SYMBOL(block_truncate_page);
				3374	EXPORT_SYMBOL(block_write_full_page);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	3375	EXPORT_SYMBOL(block_write_full_page_endio);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	3376	EXPORT_SYMBOL(cont_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3377	EXPORT_SYMBOL(end_buffer_read_sync);
				3378	EXPORT_SYMBOL(end_buffer_write_sync);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	3379	EXPORT_SYMBOL(end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3380	EXPORT_SYMBOL(file_fsync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3381	EXPORT_SYMBOL(generic_block_bmap);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	3382	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3383	EXPORT_SYMBOL(init_buffer);
				3384	EXPORT_SYMBOL(invalidate_bdev);
				3385	EXPORT_SYMBOL(ll_rw_block);
				3386	EXPORT_SYMBOL(mark_buffer_dirty);
				3387	EXPORT_SYMBOL(submit_bh);
				3388	EXPORT_SYMBOL(sync_dirty_buffer);
				3389	EXPORT_SYMBOL(unlock_buffer);