Blame - fs/xfs/linux-2.6/xfs_buf.c - kernel/msm-4.19

blob: 4cd46abe84341978a76e3af4edf5dc6e270560e1 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	2	* Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* This program is free software; you can redistribute it and/or modify it
				5	* under the terms of version 2 of the GNU General Public License as
				6	* published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it would be useful, but
				9	* WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				11	*
				12	* Further, this software is distributed without any warranty that it is
				13	* free of the rightful claim of any third person regarding infringement
				14	* or the like. Any license provided herein, whether implied or
				15	* otherwise, applies only to this software file. Patent licenses, if
				16	* any, provided herein do not apply to combinations of this program with
				17	* other software, or any other product whatsoever.
				18	*
				19	* You should have received a copy of the GNU General Public License along
				20	* with this program; if not, write the Free Software Foundation, Inc., 59
				21	* Temple Place - Suite 330, Boston MA 02111-1307, USA.
				22	*
				23	* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
				24	* Mountain View, CA 94043, or:
				25	*
				26	* http://www.sgi.com
				27	*
				28	* For further information regarding this notice, see:
				29	*
				30	* http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
				31	*/
				32
				33	/*
				34	* The xfs_buf.c code provides an abstract buffer cache model on top
				35	* of the Linux page cache. Cached metadata blocks for a file system
				36	* are hashed to the inode for the block device. xfs_buf.c assembles
				37	* buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
				38	*
				39	* Written by Steve Lord, Jim Mostek, Russell Cattelan
				40	* and Rajagopal Ananthanarayanan ("ananth") at SGI.
				41	*
				42	*/
				43
				44	#include <linux/stddef.h>
				45	#include <linux/errno.h>
				46	#include <linux/slab.h>
				47	#include <linux/pagemap.h>
				48	#include <linux/init.h>
				49	#include <linux/vmalloc.h>
				50	#include <linux/bio.h>
				51	#include <linux/sysctl.h>
				52	#include <linux/proc_fs.h>
				53	#include <linux/workqueue.h>
				54	#include <linux/percpu.h>
				55	#include <linux/blkdev.h>
				56	#include <linux/hash.h>
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	57	#include <linux/kthread.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	58
				59	#include "xfs_linux.h"
				60
				61	/*
				62	* File wide globals
				63	*/
				64
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	65	STATIC kmem_cache_t *pagebuf_zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	66	STATIC kmem_shaker_t pagebuf_shake;
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	67	STATIC int xfsbufd_wakeup(int, gfp_t);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	68	STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	69
				70	STATIC struct workqueue_struct *xfslogd_workqueue;
Christoph Hellwig	0829c36	2005-09-02 16:58:49 +1000	[diff] [blame]	71	struct workqueue_struct *xfsdatad_workqueue;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	72
				73	/*
				74	* Pagebuf debugging
				75	*/
				76
				77	#ifdef PAGEBUF_TRACE
				78	void
				79	pagebuf_trace(
				80	xfs_buf_t *pb,
				81	char *id,
				82	void *data,
				83	void *ra)
				84	{
				85	ktrace_enter(pagebuf_trace_buf,
				86	pb, id,
				87	(void *)(unsigned long)pb->pb_flags,
				88	(void *)(unsigned long)pb->pb_hold.counter,
				89	(void *)(unsigned long)pb->pb_sema.count.counter,
				90	(void *)current,
				91	data, ra,
				92	(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
				93	(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
				94	(void *)(unsigned long)pb->pb_buffer_length,
				95	NULL, NULL, NULL, NULL, NULL);
				96	}
				97	ktrace_t *pagebuf_trace_buf;
				98	#define PAGEBUF_TRACE_SIZE 4096
				99	#define PB_TRACE(pb, id, data) \
				100	pagebuf_trace(pb, id, (void )data, (void )__builtin_return_address(0))
				101	#else
				102	#define PB_TRACE(pb, id, data) do { } while (0)
				103	#endif
				104
				105	#ifdef PAGEBUF_LOCK_TRACKING
				106	# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
				107	# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
				108	# define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
				109	#else
				110	# define PB_SET_OWNER(pb) do { } while (0)
				111	# define PB_CLEAR_OWNER(pb) do { } while (0)
				112	# define PB_GET_OWNER(pb) do { } while (0)
				113	#endif
				114
				115	/*
				116	* Pagebuf allocation / freeing.
				117	*/
				118
				119	#define pb_to_gfp(flags) \
				120	((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
				121	((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) \| __GFP_NOWARN)
				122
				123	#define pb_to_km(flags) \
				124	(((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
				125
				126
				127	#define pagebuf_allocate(flags) \
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	128	kmem_zone_alloc(pagebuf_zone, pb_to_km(flags))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	129	#define pagebuf_deallocate(pb) \
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	130	kmem_zone_free(pagebuf_zone, (pb));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	131
				132	/*
				133	* Page Region interfaces.
				134	*
				135	* For pages in filesystems where the blocksize is smaller than the
				136	* pagesize, we use the page->private field (long) to hold a bitmap
				137	* of uptodate regions within the page.
				138	*
				139	* Each such region is "bytes per page / bits per long" bytes long.
				140	*
				141	* NBPPR == number-of-bytes-per-page-region
				142	* BTOPR == bytes-to-page-region (rounded up)
				143	* BTOPRT == bytes-to-page-region-truncated (rounded down)
				144	*/
				145	#if (BITS_PER_LONG == 32)
				146	#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
				147	#elif (BITS_PER_LONG == 64)
				148	#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
				149	#else
				150	#error BITS_PER_LONG must be 32 or 64
				151	#endif
				152	#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
				153	#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
				154	#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
				155
				156	STATIC unsigned long
				157	page_region_mask(
				158	size_t offset,
				159	size_t length)
				160	{
				161	unsigned long mask;
				162	int first, final;
				163
				164	first = BTOPR(offset);
				165	final = BTOPRT(offset + length - 1);
				166	first = min(first, final);
				167
				168	mask = ~0UL;
				169	mask <<= BITS_PER_LONG - (final - first);
				170	mask >>= BITS_PER_LONG - (final);
				171
				172	ASSERT(offset + length <= PAGE_CACHE_SIZE);
				173	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
				174
				175	return mask;
				176	}
				177
				178	STATIC inline void
				179	set_page_region(
				180	struct page *page,
				181	size_t offset,
				182	size_t length)
				183	{
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame^]	184	set_page_private(page,
				185	page_private(page) \| page_region_mask(offset, length));
				186	if (page_private(page) == ~0UL)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	187	SetPageUptodate(page);
				188	}
				189
				190	STATIC inline int
				191	test_page_region(
				192	struct page *page,
				193	size_t offset,
				194	size_t length)
				195	{
				196	unsigned long mask = page_region_mask(offset, length);
				197
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame^]	198	return (mask && (page_private(page) & mask) == mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	199	}
				200
				201	/*
				202	* Mapping of multi-page buffers into contiguous virtual space
				203	*/
				204
				205	typedef struct a_list {
				206	void *vm_addr;
				207	struct a_list *next;
				208	} a_list_t;
				209
				210	STATIC a_list_t *as_free_head;
				211	STATIC int as_list_len;
				212	STATIC DEFINE_SPINLOCK(as_lock);
				213
				214	/*
				215	* Try to batch vunmaps because they are costly.
				216	*/
				217	STATIC void
				218	free_address(
				219	void *addr)
				220	{
				221	a_list_t *aentry;
				222
				223	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
				224	if (likely(aentry)) {
				225	spin_lock(&as_lock);
				226	aentry->next = as_free_head;
				227	aentry->vm_addr = addr;
				228	as_free_head = aentry;
				229	as_list_len++;
				230	spin_unlock(&as_lock);
				231	} else {
				232	vunmap(addr);
				233	}
				234	}
				235
				236	STATIC void
				237	purge_addresses(void)
				238	{
				239	a_list_t aentry, old;
				240
				241	if (as_free_head == NULL)
				242	return;
				243
				244	spin_lock(&as_lock);
				245	aentry = as_free_head;
				246	as_free_head = NULL;
				247	as_list_len = 0;
				248	spin_unlock(&as_lock);
				249
				250	while ((old = aentry) != NULL) {
				251	vunmap(aentry->vm_addr);
				252	aentry = aentry->next;
				253	kfree(old);
				254	}
				255	}
				256
				257	/*
				258	* Internal pagebuf object manipulation
				259	*/
				260
				261	STATIC void
				262	_pagebuf_initialize(
				263	xfs_buf_t *pb,
				264	xfs_buftarg_t *target,
				265	loff_t range_base,
				266	size_t range_length,
				267	page_buf_flags_t flags)
				268	{
				269	/*
				270	* We don't want certain flags to appear in pb->pb_flags.
				271	*/
				272	flags &= ~(PBF_LOCK\|PBF_MAPPED\|PBF_DONT_BLOCK\|PBF_READ_AHEAD);
				273
				274	memset(pb, 0, sizeof(xfs_buf_t));
				275	atomic_set(&pb->pb_hold, 1);
				276	init_MUTEX_LOCKED(&pb->pb_iodonesema);
				277	INIT_LIST_HEAD(&pb->pb_list);
				278	INIT_LIST_HEAD(&pb->pb_hash_list);
				279	init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
				280	PB_SET_OWNER(pb);
				281	pb->pb_target = target;
				282	pb->pb_file_offset = range_base;
				283	/*
				284	* Set buffer_length and count_desired to the same value initially.
				285	* I/O routines should use count_desired, which will be the same in
				286	* most cases but may be reset (e.g. XFS recovery).
				287	*/
				288	pb->pb_buffer_length = pb->pb_count_desired = range_length;
				289	pb->pb_flags = flags \| PBF_NONE;
				290	pb->pb_bn = XFS_BUF_DADDR_NULL;
				291	atomic_set(&pb->pb_pin_count, 0);
				292	init_waitqueue_head(&pb->pb_waiters);
				293
				294	XFS_STATS_INC(pb_create);
				295	PB_TRACE(pb, "initialize", target);
				296	}
				297
				298	/*
				299	* Allocate a page array capable of holding a specified number
				300	* of pages, and point the page buf at it.
				301	*/
				302	STATIC int
				303	_pagebuf_get_pages(
				304	xfs_buf_t *pb,
				305	int page_count,
				306	page_buf_flags_t flags)
				307	{
				308	/* Make sure that we have a page list */
				309	if (pb->pb_pages == NULL) {
				310	pb->pb_offset = page_buf_poff(pb->pb_file_offset);
				311	pb->pb_page_count = page_count;
				312	if (page_count <= PB_PAGES) {
				313	pb->pb_pages = pb->pb_page_array;
				314	} else {
				315	pb->pb_pages = kmem_alloc(sizeof(struct page )
				316	page_count, pb_to_km(flags));
				317	if (pb->pb_pages == NULL)
				318	return -ENOMEM;
				319	}
				320	memset(pb->pb_pages, 0, sizeof(struct page ) page_count);
				321	}
				322	return 0;
				323	}
				324
				325	/*
				326	* Frees pb_pages if it was malloced.
				327	*/
				328	STATIC void
				329	_pagebuf_free_pages(
				330	xfs_buf_t *bp)
				331	{
				332	if (bp->pb_pages != bp->pb_page_array) {
				333	kmem_free(bp->pb_pages,
				334	bp->pb_page_count * sizeof(struct page *));
				335	}
				336	}
				337
				338	/*
				339	* Releases the specified buffer.
				340	*
				341	* The modification state of any associated pages is left unchanged.
				342	* The buffer most not be on any hash - use pagebuf_rele instead for
				343	* hashed and refcounted buffers
				344	*/
				345	void
				346	pagebuf_free(
				347	xfs_buf_t *bp)
				348	{
				349	PB_TRACE(bp, "free", 0);
				350
				351	ASSERT(list_empty(&bp->pb_hash_list));
				352
				353	if (bp->pb_flags & _PBF_PAGE_CACHE) {
				354	uint i;
				355
				356	if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
				357	free_address(bp->pb_addr - bp->pb_offset);
				358
				359	for (i = 0; i < bp->pb_page_count; i++)
				360	page_cache_release(bp->pb_pages[i]);
				361	_pagebuf_free_pages(bp);
				362	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
				363	/*
				364	* XXX(hch): bp->pb_count_desired might be incorrect (see
				365	* pagebuf_associate_memory for details), but fortunately
				366	* the Linux version of kmem_free ignores the len argument..
				367	*/
				368	kmem_free(bp->pb_addr, bp->pb_count_desired);
				369	_pagebuf_free_pages(bp);
				370	}
				371
				372	pagebuf_deallocate(bp);
				373	}
				374
				375	/*
				376	* Finds all pages for buffer in question and builds it's page list.
				377	*/
				378	STATIC int
				379	_pagebuf_lookup_pages(
				380	xfs_buf_t *bp,
				381	uint flags)
				382	{
				383	struct address_space *mapping = bp->pb_target->pbr_mapping;
				384	size_t blocksize = bp->pb_target->pbr_bsize;
				385	size_t size = bp->pb_count_desired;
				386	size_t nbytes, offset;
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	387	gfp_t gfp_mask = pb_to_gfp(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	388	unsigned short page_count, i;
				389	pgoff_t first;
				390	loff_t end;
				391	int error;
				392
				393	end = bp->pb_file_offset + bp->pb_buffer_length;
				394	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
				395
				396	error = _pagebuf_get_pages(bp, page_count, flags);
				397	if (unlikely(error))
				398	return error;
				399	bp->pb_flags \|= _PBF_PAGE_CACHE;
				400
				401	offset = bp->pb_offset;
				402	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
				403
				404	for (i = 0; i < bp->pb_page_count; i++) {
				405	struct page *page;
				406	uint retries = 0;
				407
				408	retry:
				409	page = find_or_create_page(mapping, first + i, gfp_mask);
				410	if (unlikely(page == NULL)) {
				411	if (flags & PBF_READ_AHEAD) {
				412	bp->pb_page_count = i;
				413	for (i = 0; i < bp->pb_page_count; i++)
				414	unlock_page(bp->pb_pages[i]);
				415	return -ENOMEM;
				416	}
				417
				418	/*
				419	* This could deadlock.
				420	*
				421	* But until all the XFS lowlevel code is revamped to
				422	* handle buffer allocation failures we can't do much.
				423	*/
				424	if (!(++retries % 100))
				425	printk(KERN_ERR
				426	"XFS: possible memory allocation "
				427	"deadlock in %s (mode:0x%x)\n",
				428	__FUNCTION__, gfp_mask);
				429
				430	XFS_STATS_INC(pb_page_retries);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	431	xfsbufd_wakeup(0, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	432	blk_congestion_wait(WRITE, HZ/50);
				433	goto retry;
				434	}
				435
				436	XFS_STATS_INC(pb_page_found);
				437
				438	nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
				439	size -= nbytes;
				440
				441	if (!PageUptodate(page)) {
				442	page_count--;
				443	if (blocksize >= PAGE_CACHE_SIZE) {
				444	if (flags & PBF_READ)
				445	bp->pb_locked = 1;
				446	} else if (!PagePrivate(page)) {
				447	if (test_page_region(page, offset, nbytes))
				448	page_count++;
				449	}
				450	}
				451
				452	bp->pb_pages[i] = page;
				453	offset = 0;
				454	}
				455
				456	if (!bp->pb_locked) {
				457	for (i = 0; i < bp->pb_page_count; i++)
				458	unlock_page(bp->pb_pages[i]);
				459	}
				460
				461	if (page_count) {
				462	/* if we have any uptodate pages, mark that in the buffer */
				463	bp->pb_flags &= ~PBF_NONE;
				464
				465	/* if some pages aren't uptodate, mark that in the buffer */
				466	if (page_count != bp->pb_page_count)
				467	bp->pb_flags \|= PBF_PARTIAL;
				468	}
				469
				470	PB_TRACE(bp, "lookup_pages", (long)page_count);
				471	return error;
				472	}
				473
				474	/*
				475	* Map buffer into kernel address-space if nessecary.
				476	*/
				477	STATIC int
				478	_pagebuf_map_pages(
				479	xfs_buf_t *bp,
				480	uint flags)
				481	{
				482	/* A single page buffer is always mappable */
				483	if (bp->pb_page_count == 1) {
				484	bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
				485	bp->pb_flags \|= PBF_MAPPED;
				486	} else if (flags & PBF_MAPPED) {
				487	if (as_list_len > 64)
				488	purge_addresses();
				489	bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				490	VM_MAP, PAGE_KERNEL);
				491	if (unlikely(bp->pb_addr == NULL))
				492	return -ENOMEM;
				493	bp->pb_addr += bp->pb_offset;
				494	bp->pb_flags \|= PBF_MAPPED;
				495	}
				496
				497	return 0;
				498	}
				499
				500	/*
				501	* Finding and Reading Buffers
				502	*/
				503
				504	/*
				505	* _pagebuf_find
				506	*
				507	* Looks up, and creates if absent, a lockable buffer for
				508	* a given range of an inode. The buffer is returned
				509	* locked. If other overlapping buffers exist, they are
				510	* released before the new buffer is created and locked,
				511	* which may imply that this call will block until those buffers
				512	* are unlocked. No I/O is implied by this call.
				513	*/
				514	xfs_buf_t *
				515	_pagebuf_find(
				516	xfs_buftarg_t btp, / block device target */
				517	loff_t ioff, /* starting offset of range */
				518	size_t isize, /* length of range */
				519	page_buf_flags_t flags, /* PBF_TRYLOCK */
				520	xfs_buf_t new_pb)/ newly allocated buffer */
				521	{
				522	loff_t range_base;
				523	size_t range_length;
				524	xfs_bufhash_t *hash;
				525	xfs_buf_t pb, n;
				526
				527	range_base = (ioff << BBSHIFT);
				528	range_length = (isize << BBSHIFT);
				529
				530	/* Check for IOs smaller than the sector size / not sector aligned */
				531	ASSERT(!(range_length < (1 << btp->pbr_sshift)));
				532	ASSERT(!(range_base & (loff_t)btp->pbr_smask));
				533
				534	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
				535
				536	spin_lock(&hash->bh_lock);
				537
				538	list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
				539	ASSERT(btp == pb->pb_target);
				540	if (pb->pb_file_offset == range_base &&
				541	pb->pb_buffer_length == range_length) {
				542	/*
				543	* If we look at something bring it to the
				544	* front of the list for next time.
				545	*/
				546	atomic_inc(&pb->pb_hold);
				547	list_move(&pb->pb_hash_list, &hash->bh_list);
				548	goto found;
				549	}
				550	}
				551
				552	/* No match found */
				553	if (new_pb) {
				554	_pagebuf_initialize(new_pb, btp, range_base,
				555	range_length, flags);
				556	new_pb->pb_hash = hash;
				557	list_add(&new_pb->pb_hash_list, &hash->bh_list);
				558	} else {
				559	XFS_STATS_INC(pb_miss_locked);
				560	}
				561
				562	spin_unlock(&hash->bh_lock);
				563	return new_pb;
				564
				565	found:
				566	spin_unlock(&hash->bh_lock);
				567
				568	/* Attempt to get the semaphore without sleeping,
				569	* if this does not work then we need to drop the
				570	* spinlock and do a hard attempt on the semaphore.
				571	*/
				572	if (down_trylock(&pb->pb_sema)) {
				573	if (!(flags & PBF_TRYLOCK)) {
				574	/* wait for buffer ownership */
				575	PB_TRACE(pb, "get_lock", 0);
				576	pagebuf_lock(pb);
				577	XFS_STATS_INC(pb_get_locked_waited);
				578	} else {
				579	/* We asked for a trylock and failed, no need
				580	* to look at file offset and length here, we
				581	* know that this pagebuf at least overlaps our
				582	* pagebuf and is locked, therefore our buffer
				583	* either does not exist, or is this buffer
				584	*/
				585
				586	pagebuf_rele(pb);
				587	XFS_STATS_INC(pb_busy_locked);
				588	return (NULL);
				589	}
				590	} else {
				591	/* trylock worked */
				592	PB_SET_OWNER(pb);
				593	}
				594
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	595	if (pb->pb_flags & PBF_STALE) {
				596	ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	597	pb->pb_flags &= PBF_MAPPED;
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	598	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	599	PB_TRACE(pb, "got_lock", 0);
				600	XFS_STATS_INC(pb_get_locked);
				601	return (pb);
				602	}
				603
				604	/*
				605	* xfs_buf_get_flags assembles a buffer covering the specified range.
				606	*
				607	* Storage in memory for all portions of the buffer will be allocated,
				608	* although backing storage may not be.
				609	*/
				610	xfs_buf_t *
				611	xfs_buf_get_flags( /* allocate a buffer */
				612	xfs_buftarg_t target,/ target for buffer */
				613	loff_t ioff, /* starting offset of range */
				614	size_t isize, /* length of range */
				615	page_buf_flags_t flags) /* PBF_TRYLOCK */
				616	{
				617	xfs_buf_t pb, new_pb;
				618	int error = 0, i;
				619
				620	new_pb = pagebuf_allocate(flags);
				621	if (unlikely(!new_pb))
				622	return NULL;
				623
				624	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
				625	if (pb == new_pb) {
				626	error = _pagebuf_lookup_pages(pb, flags);
				627	if (error)
				628	goto no_buffer;
				629	} else {
				630	pagebuf_deallocate(new_pb);
				631	if (unlikely(pb == NULL))
				632	return NULL;
				633	}
				634
				635	for (i = 0; i < pb->pb_page_count; i++)
				636	mark_page_accessed(pb->pb_pages[i]);
				637
				638	if (!(pb->pb_flags & PBF_MAPPED)) {
				639	error = _pagebuf_map_pages(pb, flags);
				640	if (unlikely(error)) {
				641	printk(KERN_WARNING "%s: failed to map pages\n",
				642	__FUNCTION__);
				643	goto no_buffer;
				644	}
				645	}
				646
				647	XFS_STATS_INC(pb_get);
				648
				649	/*
				650	* Always fill in the block number now, the mapped cases can do
				651	* their own overlay of this later.
				652	*/
				653	pb->pb_bn = ioff;
				654	pb->pb_count_desired = pb->pb_buffer_length;
				655
				656	PB_TRACE(pb, "get", (unsigned long)flags);
				657	return pb;
				658
				659	no_buffer:
				660	if (flags & (PBF_LOCK \| PBF_TRYLOCK))
				661	pagebuf_unlock(pb);
				662	pagebuf_rele(pb);
				663	return NULL;
				664	}
				665
				666	xfs_buf_t *
				667	xfs_buf_read_flags(
				668	xfs_buftarg_t *target,
				669	loff_t ioff,
				670	size_t isize,
				671	page_buf_flags_t flags)
				672	{
				673	xfs_buf_t *pb;
				674
				675	flags \|= PBF_READ;
				676
				677	pb = xfs_buf_get_flags(target, ioff, isize, flags);
				678	if (pb) {
				679	if (PBF_NOT_DONE(pb)) {
				680	PB_TRACE(pb, "read", (unsigned long)flags);
				681	XFS_STATS_INC(pb_get_read);
				682	pagebuf_iostart(pb, flags);
				683	} else if (flags & PBF_ASYNC) {
				684	PB_TRACE(pb, "read_async", (unsigned long)flags);
				685	/*
				686	* Read ahead call which is already satisfied,
				687	* drop the buffer
				688	*/
				689	goto no_buffer;
				690	} else {
				691	PB_TRACE(pb, "read_done", (unsigned long)flags);
				692	/* We do not want read in the flags */
				693	pb->pb_flags &= ~PBF_READ;
				694	}
				695	}
				696
				697	return pb;
				698
				699	no_buffer:
				700	if (flags & (PBF_LOCK \| PBF_TRYLOCK))
				701	pagebuf_unlock(pb);
				702	pagebuf_rele(pb);
				703	return NULL;
				704	}
				705
				706	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	707	* If we are not low on memory then do the readahead in a deadlock
				708	* safe manner.
				709	*/
				710	void
				711	pagebuf_readahead(
				712	xfs_buftarg_t *target,
				713	loff_t ioff,
				714	size_t isize,
				715	page_buf_flags_t flags)
				716	{
				717	struct backing_dev_info *bdi;
				718
				719	bdi = target->pbr_mapping->backing_dev_info;
				720	if (bdi_read_congested(bdi))
				721	return;
				722
				723	flags \|= (PBF_TRYLOCK\|PBF_ASYNC\|PBF_READ_AHEAD);
				724	xfs_buf_read_flags(target, ioff, isize, flags);
				725	}
				726
				727	xfs_buf_t *
				728	pagebuf_get_empty(
				729	size_t len,
				730	xfs_buftarg_t *target)
				731	{
				732	xfs_buf_t *pb;
				733
				734	pb = pagebuf_allocate(0);
				735	if (pb)
				736	_pagebuf_initialize(pb, target, 0, len, 0);
				737	return pb;
				738	}
				739
				740	static inline struct page *
				741	mem_to_page(
				742	void *addr)
				743	{
				744	if (((unsigned long)addr < VMALLOC_START) \|\|
				745	((unsigned long)addr >= VMALLOC_END)) {
				746	return virt_to_page(addr);
				747	} else {
				748	return vmalloc_to_page(addr);
				749	}
				750	}
				751
				752	int
				753	pagebuf_associate_memory(
				754	xfs_buf_t *pb,
				755	void *mem,
				756	size_t len)
				757	{
				758	int rval;
				759	int i = 0;
				760	size_t ptr;
				761	size_t end, end_cur;
				762	off_t offset;
				763	int page_count;
				764
				765	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
				766	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
				767	if (offset && (len > PAGE_CACHE_SIZE))
				768	page_count++;
				769
				770	/* Free any previous set of page pointers */
				771	if (pb->pb_pages)
				772	_pagebuf_free_pages(pb);
				773
				774	pb->pb_pages = NULL;
				775	pb->pb_addr = mem;
				776
				777	rval = _pagebuf_get_pages(pb, page_count, 0);
				778	if (rval)
				779	return rval;
				780
				781	pb->pb_offset = offset;
				782	ptr = (size_t) mem & PAGE_CACHE_MASK;
				783	end = PAGE_CACHE_ALIGN((size_t) mem + len);
				784	end_cur = end;
				785	/* set up first page */
				786	pb->pb_pages[0] = mem_to_page(mem);
				787
				788	ptr += PAGE_CACHE_SIZE;
				789	pb->pb_page_count = ++i;
				790	while (ptr < end) {
				791	pb->pb_pages[i] = mem_to_page((void *)ptr);
				792	pb->pb_page_count = ++i;
				793	ptr += PAGE_CACHE_SIZE;
				794	}
				795	pb->pb_locked = 0;
				796
				797	pb->pb_count_desired = pb->pb_buffer_length = len;
				798	pb->pb_flags \|= PBF_MAPPED;
				799
				800	return 0;
				801	}
				802
				803	xfs_buf_t *
				804	pagebuf_get_no_daddr(
				805	size_t len,
				806	xfs_buftarg_t *target)
				807	{
				808	size_t malloc_len = len;
				809	xfs_buf_t *bp;
				810	void *data;
				811	int error;
				812
				813	bp = pagebuf_allocate(0);
				814	if (unlikely(bp == NULL))
				815	goto fail;
				816	_pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);
				817
				818	try_again:
				819	data = kmem_alloc(malloc_len, KM_SLEEP \| KM_MAYFAIL);
				820	if (unlikely(data == NULL))
				821	goto fail_free_buf;
				822
				823	/* check whether alignment matches.. */
				824	if ((__psunsigned_t)data !=
				825	((__psunsigned_t)data & ~target->pbr_smask)) {
				826	/* .. else double the size and try again */
				827	kmem_free(data, malloc_len);
				828	malloc_len <<= 1;
				829	goto try_again;
				830	}
				831
				832	error = pagebuf_associate_memory(bp, data, len);
				833	if (error)
				834	goto fail_free_mem;
				835	bp->pb_flags \|= _PBF_KMEM_ALLOC;
				836
				837	pagebuf_unlock(bp);
				838
				839	PB_TRACE(bp, "no_daddr", data);
				840	return bp;
				841	fail_free_mem:
				842	kmem_free(data, malloc_len);
				843	fail_free_buf:
				844	pagebuf_free(bp);
				845	fail:
				846	return NULL;
				847	}
				848
				849	/*
				850	* pagebuf_hold
				851	*
				852	* Increment reference count on buffer, to hold the buffer concurrently
				853	* with another thread which may release (free) the buffer asynchronously.
				854	*
				855	* Must hold the buffer already to call this function.
				856	*/
				857	void
				858	pagebuf_hold(
				859	xfs_buf_t *pb)
				860	{
				861	atomic_inc(&pb->pb_hold);
				862	PB_TRACE(pb, "hold", 0);
				863	}
				864
				865	/*
				866	* pagebuf_rele
				867	*
				868	* pagebuf_rele releases a hold on the specified buffer. If the
				869	* the hold count is 1, pagebuf_rele calls pagebuf_free.
				870	*/
				871	void
				872	pagebuf_rele(
				873	xfs_buf_t *pb)
				874	{
				875	xfs_bufhash_t *hash = pb->pb_hash;
				876
				877	PB_TRACE(pb, "rele", pb->pb_relse);
				878
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	879	/*
				880	* pagebuf_lookup buffers are not hashed, not delayed write,
				881	* and don't have their own release routines. Special case.
				882	*/
				883	if (unlikely(!hash)) {
				884	ASSERT(!pb->pb_relse);
				885	if (atomic_dec_and_test(&pb->pb_hold))
				886	xfs_buf_free(pb);
				887	return;
				888	}
				889
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	890	if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
				891	int do_free = 1;
				892
				893	if (pb->pb_relse) {
				894	atomic_inc(&pb->pb_hold);
				895	spin_unlock(&hash->bh_lock);
				896	(*(pb->pb_relse)) (pb);
				897	spin_lock(&hash->bh_lock);
				898	do_free = 0;
				899	}
				900
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	901	if (pb->pb_flags & PBF_FS_MANAGED) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	902	do_free = 0;
				903	}
				904
				905	if (do_free) {
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	906	ASSERT((pb->pb_flags & (PBF_DELWRI\|_PBF_DELWRI_Q)) == 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	907	list_del_init(&pb->pb_hash_list);
				908	spin_unlock(&hash->bh_lock);
				909	pagebuf_free(pb);
				910	} else {
				911	spin_unlock(&hash->bh_lock);
				912	}
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	913	} else {
				914	/*
				915	* Catch reference count leaks
				916	*/
				917	ASSERT(atomic_read(&pb->pb_hold) >= 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	918	}
				919	}
				920
				921
				922	/*
				923	* Mutual exclusion on buffers. Locking model:
				924	*
				925	* Buffers associated with inodes for which buffer locking
				926	* is not enabled are not protected by semaphores, and are
				927	* assumed to be exclusively owned by the caller. There is a
				928	* spinlock in the buffer, used by the caller when concurrent
				929	* access is possible.
				930	*/
				931
				932	/*
				933	* pagebuf_cond_lock
				934	*
				935	* pagebuf_cond_lock locks a buffer object, if it is not already locked.
				936	* Note that this in no way
				937	* locks the underlying pages, so it is only useful for synchronizing
				938	* concurrent use of page buffer objects, not for synchronizing independent
				939	* access to the underlying pages.
				940	*/
				941	int
				942	pagebuf_cond_lock( /* lock buffer, if not locked */
				943	/* returns -EBUSY if locked) */
				944	xfs_buf_t *pb)
				945	{
				946	int locked;
				947
				948	locked = down_trylock(&pb->pb_sema) == 0;
				949	if (locked) {
				950	PB_SET_OWNER(pb);
				951	}
				952	PB_TRACE(pb, "cond_lock", (long)locked);
				953	return(locked ? 0 : -EBUSY);
				954	}
				955
				956	#if defined(DEBUG) \|\| defined(XFS_BLI_TRACE)
				957	/*
				958	* pagebuf_lock_value
				959	*
				960	* Return lock value for a pagebuf
				961	*/
				962	int
				963	pagebuf_lock_value(
				964	xfs_buf_t *pb)
				965	{
				966	return(atomic_read(&pb->pb_sema.count));
				967	}
				968	#endif
				969
				970	/*
				971	* pagebuf_lock
				972	*
				973	* pagebuf_lock locks a buffer object. Note that this in no way
				974	* locks the underlying pages, so it is only useful for synchronizing
				975	* concurrent use of page buffer objects, not for synchronizing independent
				976	* access to the underlying pages.
				977	*/
				978	int
				979	pagebuf_lock(
				980	xfs_buf_t *pb)
				981	{
				982	PB_TRACE(pb, "lock", 0);
				983	if (atomic_read(&pb->pb_io_remaining))
				984	blk_run_address_space(pb->pb_target->pbr_mapping);
				985	down(&pb->pb_sema);
				986	PB_SET_OWNER(pb);
				987	PB_TRACE(pb, "locked", 0);
				988	return 0;
				989	}
				990
				991	/*
				992	* pagebuf_unlock
				993	*
				994	* pagebuf_unlock releases the lock on the buffer object created by
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	995	* pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages
				996	* created by pagebuf_pin).
				997	*
				998	* If the buffer is marked delwri but is not queued, do so before we
				999	* unlock the buffer as we need to set flags correctly. We also need to
				1000	* take a reference for the delwri queue because the unlocker is going to
				1001	* drop their's and they don't know we just queued it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1002	*/
				1003	void
				1004	pagebuf_unlock( /* unlock buffer */
				1005	xfs_buf_t pb) / buffer to unlock */
				1006	{
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1007	if ((pb->pb_flags & (PBF_DELWRI\|_PBF_DELWRI_Q)) == PBF_DELWRI) {
				1008	atomic_inc(&pb->pb_hold);
				1009	pb->pb_flags \|= PBF_ASYNC;
				1010	pagebuf_delwri_queue(pb, 0);
				1011	}
				1012
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1013	PB_CLEAR_OWNER(pb);
				1014	up(&pb->pb_sema);
				1015	PB_TRACE(pb, "unlock", 0);
				1016	}
				1017
				1018
				1019	/*
				1020	* Pinning Buffer Storage in Memory
				1021	*/
				1022
				1023	/*
				1024	* pagebuf_pin
				1025	*
				1026	* pagebuf_pin locks all of the memory represented by a buffer in
				1027	* memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
				1028	* the same or different buffers affecting a given page, will
				1029	* properly count the number of outstanding "pin" requests. The
				1030	* buffer may be released after the pagebuf_pin and a different
				1031	* buffer used when calling pagebuf_unpin, if desired.
				1032	* pagebuf_pin should be used by the file system when it wants be
				1033	* assured that no attempt will be made to force the affected
				1034	* memory to disk. It does not assure that a given logical page
				1035	* will not be moved to a different physical page.
				1036	*/
				1037	void
				1038	pagebuf_pin(
				1039	xfs_buf_t *pb)
				1040	{
				1041	atomic_inc(&pb->pb_pin_count);
				1042	PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
				1043	}
				1044
				1045	/*
				1046	* pagebuf_unpin
				1047	*
				1048	* pagebuf_unpin reverses the locking of memory performed by
				1049	* pagebuf_pin. Note that both functions affected the logical
				1050	* pages associated with the buffer, not the buffer itself.
				1051	*/
				1052	void
				1053	pagebuf_unpin(
				1054	xfs_buf_t *pb)
				1055	{
				1056	if (atomic_dec_and_test(&pb->pb_pin_count)) {
				1057	wake_up_all(&pb->pb_waiters);
				1058	}
				1059	PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
				1060	}
				1061
				1062	int
				1063	pagebuf_ispin(
				1064	xfs_buf_t *pb)
				1065	{
				1066	return atomic_read(&pb->pb_pin_count);
				1067	}
				1068
				1069	/*
				1070	* pagebuf_wait_unpin
				1071	*
				1072	* pagebuf_wait_unpin waits until all of the memory associated
				1073	* with the buffer is not longer locked in memory. It returns
				1074	* immediately if none of the affected pages are locked.
				1075	*/
				1076	static inline void
				1077	_pagebuf_wait_unpin(
				1078	xfs_buf_t *pb)
				1079	{
				1080	DECLARE_WAITQUEUE (wait, current);
				1081
				1082	if (atomic_read(&pb->pb_pin_count) == 0)
				1083	return;
				1084
				1085	add_wait_queue(&pb->pb_waiters, &wait);
				1086	for (;;) {
				1087	set_current_state(TASK_UNINTERRUPTIBLE);
				1088	if (atomic_read(&pb->pb_pin_count) == 0)
				1089	break;
				1090	if (atomic_read(&pb->pb_io_remaining))
				1091	blk_run_address_space(pb->pb_target->pbr_mapping);
				1092	schedule();
				1093	}
				1094	remove_wait_queue(&pb->pb_waiters, &wait);
				1095	set_current_state(TASK_RUNNING);
				1096	}
				1097
				1098	/*
				1099	* Buffer Utility Routines
				1100	*/
				1101
				1102	/*
				1103	* pagebuf_iodone
				1104	*
				1105	* pagebuf_iodone marks a buffer for which I/O is in progress
				1106	* done with respect to that I/O. The pb_iodone routine, if
				1107	* present, will be called as a side-effect.
				1108	*/
				1109	STATIC void
				1110	pagebuf_iodone_work(
				1111	void *v)
				1112	{
				1113	xfs_buf_t bp = (xfs_buf_t )v;
				1114
				1115	if (bp->pb_iodone)
				1116	(*(bp->pb_iodone))(bp);
				1117	else if (bp->pb_flags & PBF_ASYNC)
				1118	xfs_buf_relse(bp);
				1119	}
				1120
				1121	void
				1122	pagebuf_iodone(
				1123	xfs_buf_t *pb,
				1124	int dataio,
				1125	int schedule)
				1126	{
				1127	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE);
				1128	if (pb->pb_error == 0) {
				1129	pb->pb_flags &= ~(PBF_PARTIAL \| PBF_NONE);
				1130	}
				1131
				1132	PB_TRACE(pb, "iodone", pb->pb_iodone);
				1133
				1134	if ((pb->pb_iodone) \|\| (pb->pb_flags & PBF_ASYNC)) {
				1135	if (schedule) {
				1136	INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1137	queue_work(dataio ? xfsdatad_workqueue :
				1138	xfslogd_workqueue, &pb->pb_iodone_work);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1139	} else {
				1140	pagebuf_iodone_work(pb);
				1141	}
				1142	} else {
				1143	up(&pb->pb_iodonesema);
				1144	}
				1145	}
				1146
				1147	/*
				1148	* pagebuf_ioerror
				1149	*
				1150	* pagebuf_ioerror sets the error code for a buffer.
				1151	*/
				1152	void
				1153	pagebuf_ioerror( /* mark/clear buffer error flag */
				1154	xfs_buf_t pb, / buffer to mark */
				1155	int error) /* error to store (0 if none) */
				1156	{
				1157	ASSERT(error >= 0 && error <= 0xffff);
				1158	pb->pb_error = (unsigned short)error;
				1159	PB_TRACE(pb, "ioerror", (unsigned long)error);
				1160	}
				1161
				1162	/*
				1163	* pagebuf_iostart
				1164	*
				1165	* pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
				1166	* If necessary, it will arrange for any disk space allocation required,
				1167	* and it will break up the request if the block mappings require it.
				1168	* The pb_iodone routine in the buffer supplied will only be called
				1169	* when all of the subsidiary I/O requests, if any, have been completed.
				1170	* pagebuf_iostart calls the pagebuf_ioinitiate routine or
				1171	* pagebuf_iorequest, if the former routine is not defined, to start
				1172	* the I/O on a given low-level request.
				1173	*/
				1174	int
				1175	pagebuf_iostart( /* start I/O on a buffer */
				1176	xfs_buf_t pb, / buffer to start */
				1177	page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
				1178	/* PBF_WRITE, PBF_DELWRI, */
				1179	/* PBF_DONT_BLOCK */
				1180	{
				1181	int status = 0;
				1182
				1183	PB_TRACE(pb, "iostart", (unsigned long)flags);
				1184
				1185	if (flags & PBF_DELWRI) {
				1186	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE \| PBF_ASYNC);
				1187	pb->pb_flags \|= flags & (PBF_DELWRI \| PBF_ASYNC);
				1188	pagebuf_delwri_queue(pb, 1);
				1189	return status;
				1190	}
				1191
				1192	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE \| PBF_ASYNC \| PBF_DELWRI \| \
				1193	PBF_READ_AHEAD \| _PBF_RUN_QUEUES);
				1194	pb->pb_flags \|= flags & (PBF_READ \| PBF_WRITE \| PBF_ASYNC \| \
				1195	PBF_READ_AHEAD \| _PBF_RUN_QUEUES);
				1196
				1197	BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
				1198
				1199	/* For writes allow an alternate strategy routine to precede
				1200	* the actual I/O request (which may not be issued at all in
				1201	* a shutdown situation, for example).
				1202	*/
				1203	status = (flags & PBF_WRITE) ?
				1204	pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
				1205
				1206	/* Wait for I/O if we are not an async request.
				1207	* Note: async I/O request completion will release the buffer,
				1208	* and that can already be done by this point. So using the
				1209	* buffer pointer from here on, after async I/O, is invalid.
				1210	*/
				1211	if (!status && !(flags & PBF_ASYNC))
				1212	status = pagebuf_iowait(pb);
				1213
				1214	return status;
				1215	}
				1216
				1217	/*
				1218	* Helper routine for pagebuf_iorequest
				1219	*/
				1220
				1221	STATIC __inline__ int
				1222	_pagebuf_iolocked(
				1223	xfs_buf_t *pb)
				1224	{
				1225	ASSERT(pb->pb_flags & (PBF_READ\|PBF_WRITE));
				1226	if (pb->pb_flags & PBF_READ)
				1227	return pb->pb_locked;
				1228	return 0;
				1229	}
				1230
				1231	STATIC __inline__ void
				1232	_pagebuf_iodone(
				1233	xfs_buf_t *pb,
				1234	int schedule)
				1235	{
				1236	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
				1237	pb->pb_locked = 0;
				1238	pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
				1239	}
				1240	}
				1241
				1242	STATIC int
				1243	bio_end_io_pagebuf(
				1244	struct bio *bio,
				1245	unsigned int bytes_done,
				1246	int error)
				1247	{
				1248	xfs_buf_t pb = (xfs_buf_t )bio->bi_private;
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1249	unsigned int blocksize = pb->pb_target->pbr_bsize;
				1250	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1251
				1252	if (bio->bi_size)
				1253	return 1;
				1254
				1255	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				1256	pb->pb_error = EIO;
				1257
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1258	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1259	struct page *page = bvec->bv_page;
				1260
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1261	if (unlikely(pb->pb_error)) {
				1262	if (pb->pb_flags & PBF_READ)
				1263	ClearPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1264	SetPageError(page);
				1265	} else if (blocksize == PAGE_CACHE_SIZE) {
				1266	SetPageUptodate(page);
				1267	} else if (!PagePrivate(page) &&
				1268	(pb->pb_flags & _PBF_PAGE_CACHE)) {
				1269	set_page_region(page, bvec->bv_offset, bvec->bv_len);
				1270	}
				1271
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1272	if (--bvec >= bio->bi_io_vec)
				1273	prefetchw(&bvec->bv_page->flags);
				1274
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1275	if (_pagebuf_iolocked(pb)) {
				1276	unlock_page(page);
				1277	}
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1278	} while (bvec >= bio->bi_io_vec);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1279
				1280	_pagebuf_iodone(pb, 1);
				1281	bio_put(bio);
				1282	return 0;
				1283	}
				1284
				1285	STATIC void
				1286	_pagebuf_ioapply(
				1287	xfs_buf_t *pb)
				1288	{
				1289	int i, rw, map_i, total_nr_pages, nr_pages;
				1290	struct bio *bio;
				1291	int offset = pb->pb_offset;
				1292	int size = pb->pb_count_desired;
				1293	sector_t sector = pb->pb_bn;
				1294	unsigned int blocksize = pb->pb_target->pbr_bsize;
				1295	int locking = _pagebuf_iolocked(pb);
				1296
				1297	total_nr_pages = pb->pb_page_count;
				1298	map_i = 0;
				1299
				1300	if (pb->pb_flags & _PBF_RUN_QUEUES) {
				1301	pb->pb_flags &= ~_PBF_RUN_QUEUES;
				1302	rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC;
				1303	} else {
				1304	rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
				1305	}
				1306
				1307	/* Special code path for reading a sub page size pagebuf in --
				1308	* we populate up the whole page, and hence the other metadata
				1309	* in the same page. This optimization is only valid when the
				1310	* filesystem block size and the page size are equal.
				1311	*/
				1312	if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
				1313	(pb->pb_flags & PBF_READ) && locking &&
				1314	(blocksize == PAGE_CACHE_SIZE)) {
				1315	bio = bio_alloc(GFP_NOIO, 1);
				1316
				1317	bio->bi_bdev = pb->pb_target->pbr_bdev;
				1318	bio->bi_sector = sector - (offset >> BBSHIFT);
				1319	bio->bi_end_io = bio_end_io_pagebuf;
				1320	bio->bi_private = pb;
				1321
				1322	bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
				1323	size = 0;
				1324
				1325	atomic_inc(&pb->pb_io_remaining);
				1326
				1327	goto submit_io;
				1328	}
				1329
				1330	/* Lock down the pages which we need to for the request */
				1331	if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
				1332	for (i = 0; size; i++) {
				1333	int nbytes = PAGE_CACHE_SIZE - offset;
				1334	struct page *page = pb->pb_pages[i];
				1335
				1336	if (nbytes > size)
				1337	nbytes = size;
				1338
				1339	lock_page(page);
				1340
				1341	size -= nbytes;
				1342	offset = 0;
				1343	}
				1344	offset = pb->pb_offset;
				1345	size = pb->pb_count_desired;
				1346	}
				1347
				1348	next_chunk:
				1349	atomic_inc(&pb->pb_io_remaining);
				1350	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
				1351	if (nr_pages > total_nr_pages)
				1352	nr_pages = total_nr_pages;
				1353
				1354	bio = bio_alloc(GFP_NOIO, nr_pages);
				1355	bio->bi_bdev = pb->pb_target->pbr_bdev;
				1356	bio->bi_sector = sector;
				1357	bio->bi_end_io = bio_end_io_pagebuf;
				1358	bio->bi_private = pb;
				1359
				1360	for (; size && nr_pages; nr_pages--, map_i++) {
				1361	int nbytes = PAGE_CACHE_SIZE - offset;
				1362
				1363	if (nbytes > size)
				1364	nbytes = size;
				1365
				1366	if (bio_add_page(bio, pb->pb_pages[map_i],
				1367	nbytes, offset) < nbytes)
				1368	break;
				1369
				1370	offset = 0;
				1371	sector += nbytes >> BBSHIFT;
				1372	size -= nbytes;
				1373	total_nr_pages--;
				1374	}
				1375
				1376	submit_io:
				1377	if (likely(bio->bi_size)) {
				1378	submit_bio(rw, bio);
				1379	if (size)
				1380	goto next_chunk;
				1381	} else {
				1382	bio_put(bio);
				1383	pagebuf_ioerror(pb, EIO);
				1384	}
				1385	}
				1386
				1387	/*
				1388	* pagebuf_iorequest -- the core I/O request routine.
				1389	*/
				1390	int
				1391	pagebuf_iorequest( /* start real I/O */
				1392	xfs_buf_t pb) / buffer to convey to device */
				1393	{
				1394	PB_TRACE(pb, "iorequest", 0);
				1395
				1396	if (pb->pb_flags & PBF_DELWRI) {
				1397	pagebuf_delwri_queue(pb, 1);
				1398	return 0;
				1399	}
				1400
				1401	if (pb->pb_flags & PBF_WRITE) {
				1402	_pagebuf_wait_unpin(pb);
				1403	}
				1404
				1405	pagebuf_hold(pb);
				1406
				1407	/* Set the count to 1 initially, this will stop an I/O
				1408	* completion callout which happens before we have started
				1409	* all the I/O from calling pagebuf_iodone too early.
				1410	*/
				1411	atomic_set(&pb->pb_io_remaining, 1);
				1412	_pagebuf_ioapply(pb);
				1413	_pagebuf_iodone(pb, 0);
				1414
				1415	pagebuf_rele(pb);
				1416	return 0;
				1417	}
				1418
				1419	/*
				1420	* pagebuf_iowait
				1421	*
				1422	* pagebuf_iowait waits for I/O to complete on the buffer supplied.
				1423	* It returns immediately if no I/O is pending. In any case, it returns
				1424	* the error code, if any, or 0 if there is no error.
				1425	*/
				1426	int
				1427	pagebuf_iowait(
				1428	xfs_buf_t *pb)
				1429	{
				1430	PB_TRACE(pb, "iowait", 0);
				1431	if (atomic_read(&pb->pb_io_remaining))
				1432	blk_run_address_space(pb->pb_target->pbr_mapping);
				1433	down(&pb->pb_iodonesema);
				1434	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
				1435	return pb->pb_error;
				1436	}
				1437
				1438	caddr_t
				1439	pagebuf_offset(
				1440	xfs_buf_t *pb,
				1441	size_t offset)
				1442	{
				1443	struct page *page;
				1444
				1445	offset += pb->pb_offset;
				1446
				1447	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
				1448	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
				1449	}
				1450
				1451	/*
				1452	* pagebuf_iomove
				1453	*
				1454	* Move data into or out of a buffer.
				1455	*/
				1456	void
				1457	pagebuf_iomove(
				1458	xfs_buf_t pb, / buffer to process */
				1459	size_t boff, /* starting buffer offset */
				1460	size_t bsize, /* length to copy */
				1461	caddr_t data, /* data address */
				1462	page_buf_rw_t mode) /* read/write flag */
				1463	{
				1464	size_t bend, cpoff, csize;
				1465	struct page *page;
				1466
				1467	bend = boff + bsize;
				1468	while (boff < bend) {
				1469	page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
				1470	cpoff = page_buf_poff(boff + pb->pb_offset);
				1471	csize = min_t(size_t,
				1472	PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
				1473
				1474	ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
				1475
				1476	switch (mode) {
				1477	case PBRW_ZERO:
				1478	memset(page_address(page) + cpoff, 0, csize);
				1479	break;
				1480	case PBRW_READ:
				1481	memcpy(data, page_address(page) + cpoff, csize);
				1482	break;
				1483	case PBRW_WRITE:
				1484	memcpy(page_address(page) + cpoff, data, csize);
				1485	}
				1486
				1487	boff += csize;
				1488	data += csize;
				1489	}
				1490	}
				1491
				1492	/*
				1493	* Handling of buftargs.
				1494	*/
				1495
				1496	/*
				1497	* Wait for any bufs with callbacks that have been submitted but
				1498	* have not yet returned... walk the hash list for the target.
				1499	*/
				1500	void
				1501	xfs_wait_buftarg(
				1502	xfs_buftarg_t *btp)
				1503	{
				1504	xfs_buf_t bp, n;
				1505	xfs_bufhash_t *hash;
				1506	uint i;
				1507
				1508	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
				1509	hash = &btp->bt_hash[i];
				1510	again:
				1511	spin_lock(&hash->bh_lock);
				1512	list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) {
				1513	ASSERT(btp == bp->pb_target);
				1514	if (!(bp->pb_flags & PBF_FS_MANAGED)) {
				1515	spin_unlock(&hash->bh_lock);
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1516	/*
				1517	* Catch superblock reference count leaks
				1518	* immediately
				1519	*/
				1520	BUG_ON(bp->pb_bn == 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1521	delay(100);
				1522	goto again;
				1523	}
				1524	}
				1525	spin_unlock(&hash->bh_lock);
				1526	}
				1527	}
				1528
				1529	/*
				1530	* Allocate buffer hash table for a given target.
				1531	* For devices containing metadata (i.e. not the log/realtime devices)
				1532	* we need to allocate a much larger hash table.
				1533	*/
				1534	STATIC void
				1535	xfs_alloc_bufhash(
				1536	xfs_buftarg_t *btp,
				1537	int external)
				1538	{
				1539	unsigned int i;
				1540
				1541	btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
				1542	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
				1543	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
				1544	sizeof(xfs_bufhash_t), KM_SLEEP);
				1545	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
				1546	spin_lock_init(&btp->bt_hash[i].bh_lock);
				1547	INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
				1548	}
				1549	}
				1550
				1551	STATIC void
				1552	xfs_free_bufhash(
				1553	xfs_buftarg_t *btp)
				1554	{
				1555	kmem_free(btp->bt_hash,
				1556	(1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
				1557	btp->bt_hash = NULL;
				1558	}
				1559
				1560	void
				1561	xfs_free_buftarg(
				1562	xfs_buftarg_t *btp,
				1563	int external)
				1564	{
				1565	xfs_flush_buftarg(btp, 1);
				1566	if (external)
				1567	xfs_blkdev_put(btp->pbr_bdev);
				1568	xfs_free_bufhash(btp);
				1569	iput(btp->pbr_mapping->host);
				1570	kmem_free(btp, sizeof(*btp));
				1571	}
				1572
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1573	STATIC int
				1574	xfs_setsize_buftarg_flags(
				1575	xfs_buftarg_t *btp,
				1576	unsigned int blocksize,
				1577	unsigned int sectorsize,
				1578	int verbose)
				1579	{
				1580	btp->pbr_bsize = blocksize;
				1581	btp->pbr_sshift = ffs(sectorsize) - 1;
				1582	btp->pbr_smask = sectorsize - 1;
				1583
				1584	if (set_blocksize(btp->pbr_bdev, sectorsize)) {
				1585	printk(KERN_WARNING
				1586	"XFS: Cannot set_blocksize to %u on device %s\n",
				1587	sectorsize, XFS_BUFTARG_NAME(btp));
				1588	return EINVAL;
				1589	}
				1590
				1591	if (verbose &&
				1592	(PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
				1593	printk(KERN_WARNING
				1594	"XFS: %u byte sectors in use on device %s. "
				1595	"This is suboptimal; %u or greater is ideal.\n",
				1596	sectorsize, XFS_BUFTARG_NAME(btp),
				1597	(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
				1598	}
				1599
				1600	return 0;
				1601	}
				1602
				1603	/*
				1604	* When allocating the initial buffer target we have not yet
				1605	* read in the superblock, so don't know what sized sectors
				1606	* are being used is at this early stage. Play safe.
				1607	*/
				1608	STATIC int
				1609	xfs_setsize_buftarg_early(
				1610	xfs_buftarg_t *btp,
				1611	struct block_device *bdev)
				1612	{
				1613	return xfs_setsize_buftarg_flags(btp,
				1614	PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
				1615	}
				1616
				1617	int
				1618	xfs_setsize_buftarg(
				1619	xfs_buftarg_t *btp,
				1620	unsigned int blocksize,
				1621	unsigned int sectorsize)
				1622	{
				1623	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
				1624	}
				1625
				1626	STATIC int
				1627	xfs_mapping_buftarg(
				1628	xfs_buftarg_t *btp,
				1629	struct block_device *bdev)
				1630	{
				1631	struct backing_dev_info *bdi;
				1632	struct inode *inode;
				1633	struct address_space *mapping;
				1634	static struct address_space_operations mapping_aops = {
				1635	.sync_page = block_sync_page,
				1636	};
				1637
				1638	inode = new_inode(bdev->bd_inode->i_sb);
				1639	if (!inode) {
				1640	printk(KERN_WARNING
				1641	"XFS: Cannot allocate mapping inode for device %s\n",
				1642	XFS_BUFTARG_NAME(btp));
				1643	return ENOMEM;
				1644	}
				1645	inode->i_mode = S_IFBLK;
				1646	inode->i_bdev = bdev;
				1647	inode->i_rdev = bdev->bd_dev;
				1648	bdi = blk_get_backing_dev_info(bdev);
				1649	if (!bdi)
				1650	bdi = &default_backing_dev_info;
				1651	mapping = &inode->i_data;
				1652	mapping->a_ops = &mapping_aops;
				1653	mapping->backing_dev_info = bdi;
				1654	mapping_set_gfp_mask(mapping, GFP_NOFS);
				1655	btp->pbr_mapping = mapping;
				1656	return 0;
				1657	}
				1658
				1659	xfs_buftarg_t *
				1660	xfs_alloc_buftarg(
				1661	struct block_device *bdev,
				1662	int external)
				1663	{
				1664	xfs_buftarg_t *btp;
				1665
				1666	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
				1667
				1668	btp->pbr_dev = bdev->bd_dev;
				1669	btp->pbr_bdev = bdev;
				1670	if (xfs_setsize_buftarg_early(btp, bdev))
				1671	goto error;
				1672	if (xfs_mapping_buftarg(btp, bdev))
				1673	goto error;
				1674	xfs_alloc_bufhash(btp, external);
				1675	return btp;
				1676
				1677	error:
				1678	kmem_free(btp, sizeof(*btp));
				1679	return NULL;
				1680	}
				1681
				1682
				1683	/*
				1684	* Pagebuf delayed write buffer handling
				1685	*/
				1686
				1687	STATIC LIST_HEAD(pbd_delwrite_queue);
				1688	STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);
				1689
				1690	STATIC void
				1691	pagebuf_delwri_queue(
				1692	xfs_buf_t *pb,
				1693	int unlock)
				1694	{
				1695	PB_TRACE(pb, "delwri_q", (long)unlock);
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1696	ASSERT((pb->pb_flags & (PBF_DELWRI\|PBF_ASYNC)) ==
				1697	(PBF_DELWRI\|PBF_ASYNC));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1698
				1699	spin_lock(&pbd_delwrite_lock);
				1700	/* If already in the queue, dequeue and place at tail */
				1701	if (!list_empty(&pb->pb_list)) {
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1702	ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1703	if (unlock) {
				1704	atomic_dec(&pb->pb_hold);
				1705	}
				1706	list_del(&pb->pb_list);
				1707	}
				1708
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1709	pb->pb_flags \|= _PBF_DELWRI_Q;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1710	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
				1711	pb->pb_queuetime = jiffies;
				1712	spin_unlock(&pbd_delwrite_lock);
				1713
				1714	if (unlock)
				1715	pagebuf_unlock(pb);
				1716	}
				1717
				1718	void
				1719	pagebuf_delwri_dequeue(
				1720	xfs_buf_t *pb)
				1721	{
				1722	int dequeued = 0;
				1723
				1724	spin_lock(&pbd_delwrite_lock);
				1725	if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1726	ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1727	list_del_init(&pb->pb_list);
				1728	dequeued = 1;
				1729	}
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1730	pb->pb_flags &= ~(PBF_DELWRI\|_PBF_DELWRI_Q);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1731	spin_unlock(&pbd_delwrite_lock);
				1732
				1733	if (dequeued)
				1734	pagebuf_rele(pb);
				1735
				1736	PB_TRACE(pb, "delwri_dq", (long)dequeued);
				1737	}
				1738
				1739	STATIC void
				1740	pagebuf_runall_queues(
				1741	struct workqueue_struct *queue)
				1742	{
				1743	flush_workqueue(queue);
				1744	}
				1745
				1746	/* Defines for pagebuf daemon */
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1747	STATIC struct task_struct *xfsbufd_task;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1748	STATIC int xfsbufd_force_flush;
				1749	STATIC int xfsbufd_force_sleep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1750
				1751	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1752	xfsbufd_wakeup(
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	1753	int priority,
				1754	gfp_t mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1755	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1756	if (xfsbufd_force_sleep)
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1757	return 0;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1758	xfsbufd_force_flush = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1759	barrier();
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1760	wake_up_process(xfsbufd_task);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1761	return 0;
				1762	}
				1763
				1764	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1765	xfsbufd(
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1766	void *data)
				1767	{
				1768	struct list_head tmp;
				1769	unsigned long age;
				1770	xfs_buftarg_t *target;
				1771	xfs_buf_t pb, n;
				1772
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1773	current->flags \|= PF_MEMALLOC;
				1774
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1775	INIT_LIST_HEAD(&tmp);
				1776	do {
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame]	1777	if (unlikely(freezing(current))) {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1778	xfsbufd_force_sleep = 1;
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame]	1779	refrigerator();
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1780	} else {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1781	xfsbufd_force_sleep = 0;
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1782	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1783
Nishanth Aravamudan	041e0e3	2005-09-10 00:27:23 -0700	[diff] [blame]	1784	schedule_timeout_interruptible
				1785	(xfs_buf_timer_centisecs * msecs_to_jiffies(10));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1786
Nishanth Aravamudan	041e0e3	2005-09-10 00:27:23 -0700	[diff] [blame]	1787	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1788	spin_lock(&pbd_delwrite_lock);
				1789	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
				1790	PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
				1791	ASSERT(pb->pb_flags & PBF_DELWRI);
				1792
				1793	if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1794	if (!xfsbufd_force_flush &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1795	time_before(jiffies,
				1796	pb->pb_queuetime + age)) {
				1797	pagebuf_unlock(pb);
				1798	break;
				1799	}
				1800
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1801	pb->pb_flags &= ~(PBF_DELWRI\|_PBF_DELWRI_Q);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1802	pb->pb_flags \|= PBF_WRITE;
				1803	list_move(&pb->pb_list, &tmp);
				1804	}
				1805	}
				1806	spin_unlock(&pbd_delwrite_lock);
				1807
				1808	while (!list_empty(&tmp)) {
				1809	pb = list_entry(tmp.next, xfs_buf_t, pb_list);
				1810	target = pb->pb_target;
				1811
				1812	list_del_init(&pb->pb_list);
				1813	pagebuf_iostrategy(pb);
				1814
				1815	blk_run_address_space(target->pbr_mapping);
				1816	}
				1817
				1818	if (as_list_len > 0)
				1819	purge_addresses();
				1820
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1821	xfsbufd_force_flush = 0;
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1822	} while (!kthread_should_stop());
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1823
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1824	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1825	}
				1826
				1827	/*
				1828	* Go through all incore buffers, and release buffers if they belong to
				1829	* the given device. This is used in filesystem error handling to
				1830	* preserve the consistency of its metadata.
				1831	*/
				1832	int
				1833	xfs_flush_buftarg(
				1834	xfs_buftarg_t *target,
				1835	int wait)
				1836	{
				1837	struct list_head tmp;
				1838	xfs_buf_t pb, n;
				1839	int pincount = 0;
				1840
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1841	pagebuf_runall_queues(xfsdatad_workqueue);
				1842	pagebuf_runall_queues(xfslogd_workqueue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1843
				1844	INIT_LIST_HEAD(&tmp);
				1845	spin_lock(&pbd_delwrite_lock);
				1846	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
				1847
				1848	if (pb->pb_target != target)
				1849	continue;
				1850
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1851	ASSERT(pb->pb_flags & (PBF_DELWRI\|_PBF_DELWRI_Q));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1852	PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
				1853	if (pagebuf_ispin(pb)) {
				1854	pincount++;
				1855	continue;
				1856	}
				1857
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1858	list_move(&pb->pb_list, &tmp);
				1859	}
				1860	spin_unlock(&pbd_delwrite_lock);
				1861
				1862	/*
				1863	* Dropped the delayed write list lock, now walk the temporary list
				1864	*/
				1865	list_for_each_entry_safe(pb, n, &tmp, pb_list) {
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1866	pagebuf_lock(pb);
				1867	pb->pb_flags &= ~(PBF_DELWRI\|_PBF_DELWRI_Q);
				1868	pb->pb_flags \|= PBF_WRITE;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1869	if (wait)
				1870	pb->pb_flags &= ~PBF_ASYNC;
				1871	else
				1872	list_del_init(&pb->pb_list);
				1873
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1874	pagebuf_iostrategy(pb);
				1875	}
				1876
				1877	/*
				1878	* Remaining list items must be flushed before returning
				1879	*/
				1880	while (!list_empty(&tmp)) {
				1881	pb = list_entry(tmp.next, xfs_buf_t, pb_list);
				1882
				1883	list_del_init(&pb->pb_list);
				1884	xfs_iowait(pb);
				1885	xfs_buf_relse(pb);
				1886	}
				1887
				1888	if (wait)
				1889	blk_run_address_space(target->pbr_mapping);
				1890
				1891	return pincount;
				1892	}
				1893
				1894	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1895	xfs_buf_daemons_start(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1896	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1897	int error = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1898
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1899	xfslogd_workqueue = create_workqueue("xfslogd");
				1900	if (!xfslogd_workqueue)
				1901	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1902
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1903	xfsdatad_workqueue = create_workqueue("xfsdatad");
				1904	if (!xfsdatad_workqueue)
				1905	goto out_destroy_xfslogd_workqueue;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1906
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1907	xfsbufd_task = kthread_run(xfsbufd, NULL, "xfsbufd");
				1908	if (IS_ERR(xfsbufd_task)) {
				1909	error = PTR_ERR(xfsbufd_task);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1910	goto out_destroy_xfsdatad_workqueue;
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1911	}
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1912	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1913
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1914	out_destroy_xfsdatad_workqueue:
				1915	destroy_workqueue(xfsdatad_workqueue);
				1916	out_destroy_xfslogd_workqueue:
				1917	destroy_workqueue(xfslogd_workqueue);
				1918	out:
				1919	return error;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1920	}
				1921
				1922	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1923	* Note: do not mark as __exit, it is called from pagebuf_terminate.
				1924	*/
				1925	STATIC void
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1926	xfs_buf_daemons_stop(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1927	{
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1928	kthread_stop(xfsbufd_task);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1929	destroy_workqueue(xfslogd_workqueue);
				1930	destroy_workqueue(xfsdatad_workqueue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1931	}
				1932
				1933	/*
				1934	* Initialization and Termination
				1935	*/
				1936
				1937	int __init
				1938	pagebuf_init(void)
				1939	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1940	int error = -ENOMEM;
				1941
				1942	pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
				1943	if (!pagebuf_zone)
				1944	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1945
				1946	#ifdef PAGEBUF_TRACE
				1947	pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
				1948	#endif
				1949
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1950	error = xfs_buf_daemons_start();
Christoph Hellwig	cf9937c	2005-06-21 15:35:24 +1000	[diff] [blame]	1951	if (error)
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1952	goto out_free_buf_zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1953
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1954	pagebuf_shake = kmem_shake_register(xfsbufd_wakeup);
				1955	if (!pagebuf_shake) {
				1956	error = -ENOMEM;
				1957	goto out_stop_daemons;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1958	}
				1959
				1960	return 0;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1961
				1962	out_stop_daemons:
				1963	xfs_buf_daemons_stop();
				1964	out_free_buf_zone:
				1965	#ifdef PAGEBUF_TRACE
				1966	ktrace_free(pagebuf_trace_buf);
				1967	#endif
				1968	kmem_zone_destroy(pagebuf_zone);
				1969	out:
				1970	return error;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1971	}
				1972
				1973
				1974	/*
				1975	* pagebuf_terminate.
				1976	*
				1977	* Note: do not mark as __exit, this is also called from the __init code.
				1978	*/
				1979	void
				1980	pagebuf_terminate(void)
				1981	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1982	xfs_buf_daemons_stop();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1983
				1984	#ifdef PAGEBUF_TRACE
				1985	ktrace_free(pagebuf_trace_buf);
				1986	#endif
				1987
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1988	kmem_zone_destroy(pagebuf_zone);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1989	kmem_shake_deregister(pagebuf_shake);
				1990	}