Blame - fs/xfs/linux-2.6/xfs_buf.c - kernel/msm-4.19

blob: 4663f7dbff1cbc9230eb5818a0a8c2fbf539992e [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	2	* Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* This program is free software; you can redistribute it and/or modify it
				5	* under the terms of version 2 of the GNU General Public License as
				6	* published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it would be useful, but
				9	* WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				11	*
				12	* Further, this software is distributed without any warranty that it is
				13	* free of the rightful claim of any third person regarding infringement
				14	* or the like. Any license provided herein, whether implied or
				15	* otherwise, applies only to this software file. Patent licenses, if
				16	* any, provided herein do not apply to combinations of this program with
				17	* other software, or any other product whatsoever.
				18	*
				19	* You should have received a copy of the GNU General Public License along
				20	* with this program; if not, write the Free Software Foundation, Inc., 59
				21	* Temple Place - Suite 330, Boston MA 02111-1307, USA.
				22	*
				23	* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
				24	* Mountain View, CA 94043, or:
				25	*
				26	* http://www.sgi.com
				27	*
				28	* For further information regarding this notice, see:
				29	*
				30	* http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
				31	*/
				32
				33	/*
				34	* The xfs_buf.c code provides an abstract buffer cache model on top
				35	* of the Linux page cache. Cached metadata blocks for a file system
				36	* are hashed to the inode for the block device. xfs_buf.c assembles
				37	* buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
				38	*
				39	* Written by Steve Lord, Jim Mostek, Russell Cattelan
				40	* and Rajagopal Ananthanarayanan ("ananth") at SGI.
				41	*
				42	*/
				43
				44	#include <linux/stddef.h>
				45	#include <linux/errno.h>
				46	#include <linux/slab.h>
				47	#include <linux/pagemap.h>
				48	#include <linux/init.h>
				49	#include <linux/vmalloc.h>
				50	#include <linux/bio.h>
				51	#include <linux/sysctl.h>
				52	#include <linux/proc_fs.h>
				53	#include <linux/workqueue.h>
				54	#include <linux/percpu.h>
				55	#include <linux/blkdev.h>
				56	#include <linux/hash.h>
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	57	#include <linux/kthread.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	58
				59	#include "xfs_linux.h"
				60
				61	/*
				62	* File wide globals
				63	*/
				64
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	65	STATIC kmem_cache_t *pagebuf_zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	66	STATIC kmem_shaker_t pagebuf_shake;
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	67	STATIC int xfsbufd_wakeup(int, gfp_t);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	68	STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	69
				70	STATIC struct workqueue_struct *xfslogd_workqueue;
Christoph Hellwig	0829c36	2005-09-02 16:58:49 +1000	[diff] [blame]	71	struct workqueue_struct *xfsdatad_workqueue;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	72
				73	/*
				74	* Pagebuf debugging
				75	*/
				76
				77	#ifdef PAGEBUF_TRACE
				78	void
				79	pagebuf_trace(
				80	xfs_buf_t *pb,
				81	char *id,
				82	void *data,
				83	void *ra)
				84	{
				85	ktrace_enter(pagebuf_trace_buf,
				86	pb, id,
				87	(void *)(unsigned long)pb->pb_flags,
				88	(void *)(unsigned long)pb->pb_hold.counter,
				89	(void *)(unsigned long)pb->pb_sema.count.counter,
				90	(void *)current,
				91	data, ra,
				92	(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
				93	(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
				94	(void *)(unsigned long)pb->pb_buffer_length,
				95	NULL, NULL, NULL, NULL, NULL);
				96	}
				97	ktrace_t *pagebuf_trace_buf;
				98	#define PAGEBUF_TRACE_SIZE 4096
				99	#define PB_TRACE(pb, id, data) \
				100	pagebuf_trace(pb, id, (void )data, (void )__builtin_return_address(0))
				101	#else
				102	#define PB_TRACE(pb, id, data) do { } while (0)
				103	#endif
				104
				105	#ifdef PAGEBUF_LOCK_TRACKING
				106	# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
				107	# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
				108	# define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
				109	#else
				110	# define PB_SET_OWNER(pb) do { } while (0)
				111	# define PB_CLEAR_OWNER(pb) do { } while (0)
				112	# define PB_GET_OWNER(pb) do { } while (0)
				113	#endif
				114
				115	/*
				116	* Pagebuf allocation / freeing.
				117	*/
				118
				119	#define pb_to_gfp(flags) \
				120	((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
				121	((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) \| __GFP_NOWARN)
				122
				123	#define pb_to_km(flags) \
				124	(((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
				125
				126
				127	#define pagebuf_allocate(flags) \
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	128	kmem_zone_alloc(pagebuf_zone, pb_to_km(flags))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	129	#define pagebuf_deallocate(pb) \
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	130	kmem_zone_free(pagebuf_zone, (pb));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	131
				132	/*
				133	* Page Region interfaces.
				134	*
				135	* For pages in filesystems where the blocksize is smaller than the
				136	* pagesize, we use the page->private field (long) to hold a bitmap
				137	* of uptodate regions within the page.
				138	*
				139	* Each such region is "bytes per page / bits per long" bytes long.
				140	*
				141	* NBPPR == number-of-bytes-per-page-region
				142	* BTOPR == bytes-to-page-region (rounded up)
				143	* BTOPRT == bytes-to-page-region-truncated (rounded down)
				144	*/
				145	#if (BITS_PER_LONG == 32)
				146	#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
				147	#elif (BITS_PER_LONG == 64)
				148	#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
				149	#else
				150	#error BITS_PER_LONG must be 32 or 64
				151	#endif
				152	#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
				153	#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
				154	#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
				155
				156	STATIC unsigned long
				157	page_region_mask(
				158	size_t offset,
				159	size_t length)
				160	{
				161	unsigned long mask;
				162	int first, final;
				163
				164	first = BTOPR(offset);
				165	final = BTOPRT(offset + length - 1);
				166	first = min(first, final);
				167
				168	mask = ~0UL;
				169	mask <<= BITS_PER_LONG - (final - first);
				170	mask >>= BITS_PER_LONG - (final);
				171
				172	ASSERT(offset + length <= PAGE_CACHE_SIZE);
				173	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
				174
				175	return mask;
				176	}
				177
				178	STATIC inline void
				179	set_page_region(
				180	struct page *page,
				181	size_t offset,
				182	size_t length)
				183	{
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	184	set_page_private(page,
				185	page_private(page) \| page_region_mask(offset, length));
				186	if (page_private(page) == ~0UL)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	187	SetPageUptodate(page);
				188	}
				189
				190	STATIC inline int
				191	test_page_region(
				192	struct page *page,
				193	size_t offset,
				194	size_t length)
				195	{
				196	unsigned long mask = page_region_mask(offset, length);
				197
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	198	return (mask && (page_private(page) & mask) == mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	199	}
				200
				201	/*
				202	* Mapping of multi-page buffers into contiguous virtual space
				203	*/
				204
				205	typedef struct a_list {
				206	void *vm_addr;
				207	struct a_list *next;
				208	} a_list_t;
				209
				210	STATIC a_list_t *as_free_head;
				211	STATIC int as_list_len;
				212	STATIC DEFINE_SPINLOCK(as_lock);
				213
				214	/*
				215	* Try to batch vunmaps because they are costly.
				216	*/
				217	STATIC void
				218	free_address(
				219	void *addr)
				220	{
				221	a_list_t *aentry;
				222
				223	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
				224	if (likely(aentry)) {
				225	spin_lock(&as_lock);
				226	aentry->next = as_free_head;
				227	aentry->vm_addr = addr;
				228	as_free_head = aentry;
				229	as_list_len++;
				230	spin_unlock(&as_lock);
				231	} else {
				232	vunmap(addr);
				233	}
				234	}
				235
				236	STATIC void
				237	purge_addresses(void)
				238	{
				239	a_list_t aentry, old;
				240
				241	if (as_free_head == NULL)
				242	return;
				243
				244	spin_lock(&as_lock);
				245	aentry = as_free_head;
				246	as_free_head = NULL;
				247	as_list_len = 0;
				248	spin_unlock(&as_lock);
				249
				250	while ((old = aentry) != NULL) {
				251	vunmap(aentry->vm_addr);
				252	aentry = aentry->next;
				253	kfree(old);
				254	}
				255	}
				256
				257	/*
				258	* Internal pagebuf object manipulation
				259	*/
				260
				261	STATIC void
				262	_pagebuf_initialize(
				263	xfs_buf_t *pb,
				264	xfs_buftarg_t *target,
				265	loff_t range_base,
				266	size_t range_length,
				267	page_buf_flags_t flags)
				268	{
				269	/*
				270	* We don't want certain flags to appear in pb->pb_flags.
				271	*/
				272	flags &= ~(PBF_LOCK\|PBF_MAPPED\|PBF_DONT_BLOCK\|PBF_READ_AHEAD);
				273
				274	memset(pb, 0, sizeof(xfs_buf_t));
				275	atomic_set(&pb->pb_hold, 1);
				276	init_MUTEX_LOCKED(&pb->pb_iodonesema);
				277	INIT_LIST_HEAD(&pb->pb_list);
				278	INIT_LIST_HEAD(&pb->pb_hash_list);
				279	init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
				280	PB_SET_OWNER(pb);
				281	pb->pb_target = target;
				282	pb->pb_file_offset = range_base;
				283	/*
				284	* Set buffer_length and count_desired to the same value initially.
				285	* I/O routines should use count_desired, which will be the same in
				286	* most cases but may be reset (e.g. XFS recovery).
				287	*/
				288	pb->pb_buffer_length = pb->pb_count_desired = range_length;
				289	pb->pb_flags = flags \| PBF_NONE;
				290	pb->pb_bn = XFS_BUF_DADDR_NULL;
				291	atomic_set(&pb->pb_pin_count, 0);
				292	init_waitqueue_head(&pb->pb_waiters);
				293
				294	XFS_STATS_INC(pb_create);
				295	PB_TRACE(pb, "initialize", target);
				296	}
				297
				298	/*
				299	* Allocate a page array capable of holding a specified number
				300	* of pages, and point the page buf at it.
				301	*/
				302	STATIC int
				303	_pagebuf_get_pages(
				304	xfs_buf_t *pb,
				305	int page_count,
				306	page_buf_flags_t flags)
				307	{
				308	/* Make sure that we have a page list */
				309	if (pb->pb_pages == NULL) {
				310	pb->pb_offset = page_buf_poff(pb->pb_file_offset);
				311	pb->pb_page_count = page_count;
				312	if (page_count <= PB_PAGES) {
				313	pb->pb_pages = pb->pb_page_array;
				314	} else {
				315	pb->pb_pages = kmem_alloc(sizeof(struct page )
				316	page_count, pb_to_km(flags));
				317	if (pb->pb_pages == NULL)
				318	return -ENOMEM;
				319	}
				320	memset(pb->pb_pages, 0, sizeof(struct page ) page_count);
				321	}
				322	return 0;
				323	}
				324
				325	/*
				326	* Frees pb_pages if it was malloced.
				327	*/
				328	STATIC void
				329	_pagebuf_free_pages(
				330	xfs_buf_t *bp)
				331	{
				332	if (bp->pb_pages != bp->pb_page_array) {
				333	kmem_free(bp->pb_pages,
				334	bp->pb_page_count * sizeof(struct page *));
				335	}
				336	}
				337
				338	/*
				339	* Releases the specified buffer.
				340	*
				341	* The modification state of any associated pages is left unchanged.
				342	* The buffer most not be on any hash - use pagebuf_rele instead for
				343	* hashed and refcounted buffers
				344	*/
				345	void
				346	pagebuf_free(
				347	xfs_buf_t *bp)
				348	{
				349	PB_TRACE(bp, "free", 0);
				350
				351	ASSERT(list_empty(&bp->pb_hash_list));
				352
				353	if (bp->pb_flags & _PBF_PAGE_CACHE) {
				354	uint i;
				355
				356	if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
				357	free_address(bp->pb_addr - bp->pb_offset);
				358
				359	for (i = 0; i < bp->pb_page_count; i++)
				360	page_cache_release(bp->pb_pages[i]);
				361	_pagebuf_free_pages(bp);
				362	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
				363	/*
				364	* XXX(hch): bp->pb_count_desired might be incorrect (see
				365	* pagebuf_associate_memory for details), but fortunately
				366	* the Linux version of kmem_free ignores the len argument..
				367	*/
				368	kmem_free(bp->pb_addr, bp->pb_count_desired);
				369	_pagebuf_free_pages(bp);
				370	}
				371
				372	pagebuf_deallocate(bp);
				373	}
				374
				375	/*
				376	* Finds all pages for buffer in question and builds it's page list.
				377	*/
				378	STATIC int
				379	_pagebuf_lookup_pages(
				380	xfs_buf_t *bp,
				381	uint flags)
				382	{
				383	struct address_space *mapping = bp->pb_target->pbr_mapping;
				384	size_t blocksize = bp->pb_target->pbr_bsize;
				385	size_t size = bp->pb_count_desired;
				386	size_t nbytes, offset;
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	387	gfp_t gfp_mask = pb_to_gfp(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	388	unsigned short page_count, i;
				389	pgoff_t first;
				390	loff_t end;
				391	int error;
				392
				393	end = bp->pb_file_offset + bp->pb_buffer_length;
				394	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
				395
				396	error = _pagebuf_get_pages(bp, page_count, flags);
				397	if (unlikely(error))
				398	return error;
				399	bp->pb_flags \|= _PBF_PAGE_CACHE;
				400
				401	offset = bp->pb_offset;
				402	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
				403
				404	for (i = 0; i < bp->pb_page_count; i++) {
				405	struct page *page;
				406	uint retries = 0;
				407
				408	retry:
				409	page = find_or_create_page(mapping, first + i, gfp_mask);
				410	if (unlikely(page == NULL)) {
				411	if (flags & PBF_READ_AHEAD) {
				412	bp->pb_page_count = i;
				413	for (i = 0; i < bp->pb_page_count; i++)
				414	unlock_page(bp->pb_pages[i]);
				415	return -ENOMEM;
				416	}
				417
				418	/*
				419	* This could deadlock.
				420	*
				421	* But until all the XFS lowlevel code is revamped to
				422	* handle buffer allocation failures we can't do much.
				423	*/
				424	if (!(++retries % 100))
				425	printk(KERN_ERR
				426	"XFS: possible memory allocation "
				427	"deadlock in %s (mode:0x%x)\n",
				428	__FUNCTION__, gfp_mask);
				429
				430	XFS_STATS_INC(pb_page_retries);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	431	xfsbufd_wakeup(0, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	432	blk_congestion_wait(WRITE, HZ/50);
				433	goto retry;
				434	}
				435
				436	XFS_STATS_INC(pb_page_found);
				437
				438	nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
				439	size -= nbytes;
				440
				441	if (!PageUptodate(page)) {
				442	page_count--;
				443	if (blocksize >= PAGE_CACHE_SIZE) {
				444	if (flags & PBF_READ)
				445	bp->pb_locked = 1;
				446	} else if (!PagePrivate(page)) {
				447	if (test_page_region(page, offset, nbytes))
				448	page_count++;
				449	}
				450	}
				451
				452	bp->pb_pages[i] = page;
				453	offset = 0;
				454	}
				455
				456	if (!bp->pb_locked) {
				457	for (i = 0; i < bp->pb_page_count; i++)
				458	unlock_page(bp->pb_pages[i]);
				459	}
				460
Christoph Hellwig	739cafd	2005-11-02 10:25:51 +1100	[diff] [blame]	461	if (page_count)
				462	bp->pb_flags &= ~PBF_NONE;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	463
				464	PB_TRACE(bp, "lookup_pages", (long)page_count);
				465	return error;
				466	}
				467
				468	/*
				469	* Map buffer into kernel address-space if nessecary.
				470	*/
				471	STATIC int
				472	_pagebuf_map_pages(
				473	xfs_buf_t *bp,
				474	uint flags)
				475	{
				476	/* A single page buffer is always mappable */
				477	if (bp->pb_page_count == 1) {
				478	bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
				479	bp->pb_flags \|= PBF_MAPPED;
				480	} else if (flags & PBF_MAPPED) {
				481	if (as_list_len > 64)
				482	purge_addresses();
				483	bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				484	VM_MAP, PAGE_KERNEL);
				485	if (unlikely(bp->pb_addr == NULL))
				486	return -ENOMEM;
				487	bp->pb_addr += bp->pb_offset;
				488	bp->pb_flags \|= PBF_MAPPED;
				489	}
				490
				491	return 0;
				492	}
				493
				494	/*
				495	* Finding and Reading Buffers
				496	*/
				497
				498	/*
				499	* _pagebuf_find
				500	*
				501	* Looks up, and creates if absent, a lockable buffer for
				502	* a given range of an inode. The buffer is returned
				503	* locked. If other overlapping buffers exist, they are
				504	* released before the new buffer is created and locked,
				505	* which may imply that this call will block until those buffers
				506	* are unlocked. No I/O is implied by this call.
				507	*/
				508	xfs_buf_t *
				509	_pagebuf_find(
				510	xfs_buftarg_t btp, / block device target */
				511	loff_t ioff, /* starting offset of range */
				512	size_t isize, /* length of range */
				513	page_buf_flags_t flags, /* PBF_TRYLOCK */
				514	xfs_buf_t new_pb)/ newly allocated buffer */
				515	{
				516	loff_t range_base;
				517	size_t range_length;
				518	xfs_bufhash_t *hash;
				519	xfs_buf_t pb, n;
				520
				521	range_base = (ioff << BBSHIFT);
				522	range_length = (isize << BBSHIFT);
				523
				524	/* Check for IOs smaller than the sector size / not sector aligned */
				525	ASSERT(!(range_length < (1 << btp->pbr_sshift)));
				526	ASSERT(!(range_base & (loff_t)btp->pbr_smask));
				527
				528	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
				529
				530	spin_lock(&hash->bh_lock);
				531
				532	list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
				533	ASSERT(btp == pb->pb_target);
				534	if (pb->pb_file_offset == range_base &&
				535	pb->pb_buffer_length == range_length) {
				536	/*
				537	* If we look at something bring it to the
				538	* front of the list for next time.
				539	*/
				540	atomic_inc(&pb->pb_hold);
				541	list_move(&pb->pb_hash_list, &hash->bh_list);
				542	goto found;
				543	}
				544	}
				545
				546	/* No match found */
				547	if (new_pb) {
				548	_pagebuf_initialize(new_pb, btp, range_base,
				549	range_length, flags);
				550	new_pb->pb_hash = hash;
				551	list_add(&new_pb->pb_hash_list, &hash->bh_list);
				552	} else {
				553	XFS_STATS_INC(pb_miss_locked);
				554	}
				555
				556	spin_unlock(&hash->bh_lock);
				557	return new_pb;
				558
				559	found:
				560	spin_unlock(&hash->bh_lock);
				561
				562	/* Attempt to get the semaphore without sleeping,
				563	* if this does not work then we need to drop the
				564	* spinlock and do a hard attempt on the semaphore.
				565	*/
				566	if (down_trylock(&pb->pb_sema)) {
				567	if (!(flags & PBF_TRYLOCK)) {
				568	/* wait for buffer ownership */
				569	PB_TRACE(pb, "get_lock", 0);
				570	pagebuf_lock(pb);
				571	XFS_STATS_INC(pb_get_locked_waited);
				572	} else {
				573	/* We asked for a trylock and failed, no need
				574	* to look at file offset and length here, we
				575	* know that this pagebuf at least overlaps our
				576	* pagebuf and is locked, therefore our buffer
				577	* either does not exist, or is this buffer
				578	*/
				579
				580	pagebuf_rele(pb);
				581	XFS_STATS_INC(pb_busy_locked);
				582	return (NULL);
				583	}
				584	} else {
				585	/* trylock worked */
				586	PB_SET_OWNER(pb);
				587	}
				588
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	589	if (pb->pb_flags & PBF_STALE) {
				590	ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	591	pb->pb_flags &= PBF_MAPPED;
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	592	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	593	PB_TRACE(pb, "got_lock", 0);
				594	XFS_STATS_INC(pb_get_locked);
				595	return (pb);
				596	}
				597
				598	/*
				599	* xfs_buf_get_flags assembles a buffer covering the specified range.
				600	*
				601	* Storage in memory for all portions of the buffer will be allocated,
				602	* although backing storage may not be.
				603	*/
				604	xfs_buf_t *
				605	xfs_buf_get_flags( /* allocate a buffer */
				606	xfs_buftarg_t target,/ target for buffer */
				607	loff_t ioff, /* starting offset of range */
				608	size_t isize, /* length of range */
				609	page_buf_flags_t flags) /* PBF_TRYLOCK */
				610	{
				611	xfs_buf_t pb, new_pb;
				612	int error = 0, i;
				613
				614	new_pb = pagebuf_allocate(flags);
				615	if (unlikely(!new_pb))
				616	return NULL;
				617
				618	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
				619	if (pb == new_pb) {
				620	error = _pagebuf_lookup_pages(pb, flags);
				621	if (error)
				622	goto no_buffer;
				623	} else {
				624	pagebuf_deallocate(new_pb);
				625	if (unlikely(pb == NULL))
				626	return NULL;
				627	}
				628
				629	for (i = 0; i < pb->pb_page_count; i++)
				630	mark_page_accessed(pb->pb_pages[i]);
				631
				632	if (!(pb->pb_flags & PBF_MAPPED)) {
				633	error = _pagebuf_map_pages(pb, flags);
				634	if (unlikely(error)) {
				635	printk(KERN_WARNING "%s: failed to map pages\n",
				636	__FUNCTION__);
				637	goto no_buffer;
				638	}
				639	}
				640
				641	XFS_STATS_INC(pb_get);
				642
				643	/*
				644	* Always fill in the block number now, the mapped cases can do
				645	* their own overlay of this later.
				646	*/
				647	pb->pb_bn = ioff;
				648	pb->pb_count_desired = pb->pb_buffer_length;
				649
				650	PB_TRACE(pb, "get", (unsigned long)flags);
				651	return pb;
				652
				653	no_buffer:
				654	if (flags & (PBF_LOCK \| PBF_TRYLOCK))
				655	pagebuf_unlock(pb);
				656	pagebuf_rele(pb);
				657	return NULL;
				658	}
				659
				660	xfs_buf_t *
				661	xfs_buf_read_flags(
				662	xfs_buftarg_t *target,
				663	loff_t ioff,
				664	size_t isize,
				665	page_buf_flags_t flags)
				666	{
				667	xfs_buf_t *pb;
				668
				669	flags \|= PBF_READ;
				670
				671	pb = xfs_buf_get_flags(target, ioff, isize, flags);
				672	if (pb) {
Christoph Hellwig	88741a9	2005-11-02 10:21:14 +1100	[diff] [blame]	673	if (!XFS_BUF_ISDONE(pb)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	674	PB_TRACE(pb, "read", (unsigned long)flags);
				675	XFS_STATS_INC(pb_get_read);
				676	pagebuf_iostart(pb, flags);
				677	} else if (flags & PBF_ASYNC) {
				678	PB_TRACE(pb, "read_async", (unsigned long)flags);
				679	/*
				680	* Read ahead call which is already satisfied,
				681	* drop the buffer
				682	*/
				683	goto no_buffer;
				684	} else {
				685	PB_TRACE(pb, "read_done", (unsigned long)flags);
				686	/* We do not want read in the flags */
				687	pb->pb_flags &= ~PBF_READ;
				688	}
				689	}
				690
				691	return pb;
				692
				693	no_buffer:
				694	if (flags & (PBF_LOCK \| PBF_TRYLOCK))
				695	pagebuf_unlock(pb);
				696	pagebuf_rele(pb);
				697	return NULL;
				698	}
				699
				700	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	701	* If we are not low on memory then do the readahead in a deadlock
				702	* safe manner.
				703	*/
				704	void
				705	pagebuf_readahead(
				706	xfs_buftarg_t *target,
				707	loff_t ioff,
				708	size_t isize,
				709	page_buf_flags_t flags)
				710	{
				711	struct backing_dev_info *bdi;
				712
				713	bdi = target->pbr_mapping->backing_dev_info;
				714	if (bdi_read_congested(bdi))
				715	return;
				716
				717	flags \|= (PBF_TRYLOCK\|PBF_ASYNC\|PBF_READ_AHEAD);
				718	xfs_buf_read_flags(target, ioff, isize, flags);
				719	}
				720
				721	xfs_buf_t *
				722	pagebuf_get_empty(
				723	size_t len,
				724	xfs_buftarg_t *target)
				725	{
				726	xfs_buf_t *pb;
				727
				728	pb = pagebuf_allocate(0);
				729	if (pb)
				730	_pagebuf_initialize(pb, target, 0, len, 0);
				731	return pb;
				732	}
				733
				734	static inline struct page *
				735	mem_to_page(
				736	void *addr)
				737	{
				738	if (((unsigned long)addr < VMALLOC_START) \|\|
				739	((unsigned long)addr >= VMALLOC_END)) {
				740	return virt_to_page(addr);
				741	} else {
				742	return vmalloc_to_page(addr);
				743	}
				744	}
				745
				746	int
				747	pagebuf_associate_memory(
				748	xfs_buf_t *pb,
				749	void *mem,
				750	size_t len)
				751	{
				752	int rval;
				753	int i = 0;
				754	size_t ptr;
				755	size_t end, end_cur;
				756	off_t offset;
				757	int page_count;
				758
				759	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
				760	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
				761	if (offset && (len > PAGE_CACHE_SIZE))
				762	page_count++;
				763
				764	/* Free any previous set of page pointers */
				765	if (pb->pb_pages)
				766	_pagebuf_free_pages(pb);
				767
				768	pb->pb_pages = NULL;
				769	pb->pb_addr = mem;
				770
				771	rval = _pagebuf_get_pages(pb, page_count, 0);
				772	if (rval)
				773	return rval;
				774
				775	pb->pb_offset = offset;
				776	ptr = (size_t) mem & PAGE_CACHE_MASK;
				777	end = PAGE_CACHE_ALIGN((size_t) mem + len);
				778	end_cur = end;
				779	/* set up first page */
				780	pb->pb_pages[0] = mem_to_page(mem);
				781
				782	ptr += PAGE_CACHE_SIZE;
				783	pb->pb_page_count = ++i;
				784	while (ptr < end) {
				785	pb->pb_pages[i] = mem_to_page((void *)ptr);
				786	pb->pb_page_count = ++i;
				787	ptr += PAGE_CACHE_SIZE;
				788	}
				789	pb->pb_locked = 0;
				790
				791	pb->pb_count_desired = pb->pb_buffer_length = len;
				792	pb->pb_flags \|= PBF_MAPPED;
				793
				794	return 0;
				795	}
				796
				797	xfs_buf_t *
				798	pagebuf_get_no_daddr(
				799	size_t len,
				800	xfs_buftarg_t *target)
				801	{
				802	size_t malloc_len = len;
				803	xfs_buf_t *bp;
				804	void *data;
				805	int error;
				806
				807	bp = pagebuf_allocate(0);
				808	if (unlikely(bp == NULL))
				809	goto fail;
Christoph Hellwig	88741a9	2005-11-02 10:21:14 +1100	[diff] [blame]	810	_pagebuf_initialize(bp, target, 0, len, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	811
				812	try_again:
				813	data = kmem_alloc(malloc_len, KM_SLEEP \| KM_MAYFAIL);
				814	if (unlikely(data == NULL))
				815	goto fail_free_buf;
				816
				817	/* check whether alignment matches.. */
				818	if ((__psunsigned_t)data !=
				819	((__psunsigned_t)data & ~target->pbr_smask)) {
				820	/* .. else double the size and try again */
				821	kmem_free(data, malloc_len);
				822	malloc_len <<= 1;
				823	goto try_again;
				824	}
				825
				826	error = pagebuf_associate_memory(bp, data, len);
				827	if (error)
				828	goto fail_free_mem;
				829	bp->pb_flags \|= _PBF_KMEM_ALLOC;
				830
				831	pagebuf_unlock(bp);
				832
				833	PB_TRACE(bp, "no_daddr", data);
				834	return bp;
				835	fail_free_mem:
				836	kmem_free(data, malloc_len);
				837	fail_free_buf:
				838	pagebuf_free(bp);
				839	fail:
				840	return NULL;
				841	}
				842
				843	/*
				844	* pagebuf_hold
				845	*
				846	* Increment reference count on buffer, to hold the buffer concurrently
				847	* with another thread which may release (free) the buffer asynchronously.
				848	*
				849	* Must hold the buffer already to call this function.
				850	*/
				851	void
				852	pagebuf_hold(
				853	xfs_buf_t *pb)
				854	{
				855	atomic_inc(&pb->pb_hold);
				856	PB_TRACE(pb, "hold", 0);
				857	}
				858
				859	/*
				860	* pagebuf_rele
				861	*
				862	* pagebuf_rele releases a hold on the specified buffer. If the
				863	* the hold count is 1, pagebuf_rele calls pagebuf_free.
				864	*/
				865	void
				866	pagebuf_rele(
				867	xfs_buf_t *pb)
				868	{
				869	xfs_bufhash_t *hash = pb->pb_hash;
				870
				871	PB_TRACE(pb, "rele", pb->pb_relse);
				872
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	873	/*
				874	* pagebuf_lookup buffers are not hashed, not delayed write,
				875	* and don't have their own release routines. Special case.
				876	*/
				877	if (unlikely(!hash)) {
				878	ASSERT(!pb->pb_relse);
				879	if (atomic_dec_and_test(&pb->pb_hold))
				880	xfs_buf_free(pb);
				881	return;
				882	}
				883
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	884	if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
				885	int do_free = 1;
				886
				887	if (pb->pb_relse) {
				888	atomic_inc(&pb->pb_hold);
				889	spin_unlock(&hash->bh_lock);
				890	(*(pb->pb_relse)) (pb);
				891	spin_lock(&hash->bh_lock);
				892	do_free = 0;
				893	}
				894
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	895	if (pb->pb_flags & PBF_FS_MANAGED) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	896	do_free = 0;
				897	}
				898
				899	if (do_free) {
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	900	ASSERT((pb->pb_flags & (PBF_DELWRI\|_PBF_DELWRI_Q)) == 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	901	list_del_init(&pb->pb_hash_list);
				902	spin_unlock(&hash->bh_lock);
				903	pagebuf_free(pb);
				904	} else {
				905	spin_unlock(&hash->bh_lock);
				906	}
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	907	} else {
				908	/*
				909	* Catch reference count leaks
				910	*/
				911	ASSERT(atomic_read(&pb->pb_hold) >= 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	912	}
				913	}
				914
				915
				916	/*
				917	* Mutual exclusion on buffers. Locking model:
				918	*
				919	* Buffers associated with inodes for which buffer locking
				920	* is not enabled are not protected by semaphores, and are
				921	* assumed to be exclusively owned by the caller. There is a
				922	* spinlock in the buffer, used by the caller when concurrent
				923	* access is possible.
				924	*/
				925
				926	/*
				927	* pagebuf_cond_lock
				928	*
				929	* pagebuf_cond_lock locks a buffer object, if it is not already locked.
				930	* Note that this in no way
				931	* locks the underlying pages, so it is only useful for synchronizing
				932	* concurrent use of page buffer objects, not for synchronizing independent
				933	* access to the underlying pages.
				934	*/
				935	int
				936	pagebuf_cond_lock( /* lock buffer, if not locked */
				937	/* returns -EBUSY if locked) */
				938	xfs_buf_t *pb)
				939	{
				940	int locked;
				941
				942	locked = down_trylock(&pb->pb_sema) == 0;
				943	if (locked) {
				944	PB_SET_OWNER(pb);
				945	}
				946	PB_TRACE(pb, "cond_lock", (long)locked);
				947	return(locked ? 0 : -EBUSY);
				948	}
				949
				950	#if defined(DEBUG) \|\| defined(XFS_BLI_TRACE)
				951	/*
				952	* pagebuf_lock_value
				953	*
				954	* Return lock value for a pagebuf
				955	*/
				956	int
				957	pagebuf_lock_value(
				958	xfs_buf_t *pb)
				959	{
				960	return(atomic_read(&pb->pb_sema.count));
				961	}
				962	#endif
				963
				964	/*
				965	* pagebuf_lock
				966	*
				967	* pagebuf_lock locks a buffer object. Note that this in no way
				968	* locks the underlying pages, so it is only useful for synchronizing
				969	* concurrent use of page buffer objects, not for synchronizing independent
				970	* access to the underlying pages.
				971	*/
				972	int
				973	pagebuf_lock(
				974	xfs_buf_t *pb)
				975	{
				976	PB_TRACE(pb, "lock", 0);
				977	if (atomic_read(&pb->pb_io_remaining))
				978	blk_run_address_space(pb->pb_target->pbr_mapping);
				979	down(&pb->pb_sema);
				980	PB_SET_OWNER(pb);
				981	PB_TRACE(pb, "locked", 0);
				982	return 0;
				983	}
				984
				985	/*
				986	* pagebuf_unlock
				987	*
				988	* pagebuf_unlock releases the lock on the buffer object created by
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	989	* pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages
				990	* created by pagebuf_pin).
				991	*
				992	* If the buffer is marked delwri but is not queued, do so before we
				993	* unlock the buffer as we need to set flags correctly. We also need to
				994	* take a reference for the delwri queue because the unlocker is going to
				995	* drop their's and they don't know we just queued it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	996	*/
				997	void
				998	pagebuf_unlock( /* unlock buffer */
				999	xfs_buf_t pb) / buffer to unlock */
				1000	{
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1001	if ((pb->pb_flags & (PBF_DELWRI\|_PBF_DELWRI_Q)) == PBF_DELWRI) {
				1002	atomic_inc(&pb->pb_hold);
				1003	pb->pb_flags \|= PBF_ASYNC;
				1004	pagebuf_delwri_queue(pb, 0);
				1005	}
				1006
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1007	PB_CLEAR_OWNER(pb);
				1008	up(&pb->pb_sema);
				1009	PB_TRACE(pb, "unlock", 0);
				1010	}
				1011
				1012
				1013	/*
				1014	* Pinning Buffer Storage in Memory
				1015	*/
				1016
				1017	/*
				1018	* pagebuf_pin
				1019	*
				1020	* pagebuf_pin locks all of the memory represented by a buffer in
				1021	* memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
				1022	* the same or different buffers affecting a given page, will
				1023	* properly count the number of outstanding "pin" requests. The
				1024	* buffer may be released after the pagebuf_pin and a different
				1025	* buffer used when calling pagebuf_unpin, if desired.
				1026	* pagebuf_pin should be used by the file system when it wants be
				1027	* assured that no attempt will be made to force the affected
				1028	* memory to disk. It does not assure that a given logical page
				1029	* will not be moved to a different physical page.
				1030	*/
				1031	void
				1032	pagebuf_pin(
				1033	xfs_buf_t *pb)
				1034	{
				1035	atomic_inc(&pb->pb_pin_count);
				1036	PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
				1037	}
				1038
				1039	/*
				1040	* pagebuf_unpin
				1041	*
				1042	* pagebuf_unpin reverses the locking of memory performed by
				1043	* pagebuf_pin. Note that both functions affected the logical
				1044	* pages associated with the buffer, not the buffer itself.
				1045	*/
				1046	void
				1047	pagebuf_unpin(
				1048	xfs_buf_t *pb)
				1049	{
				1050	if (atomic_dec_and_test(&pb->pb_pin_count)) {
				1051	wake_up_all(&pb->pb_waiters);
				1052	}
				1053	PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
				1054	}
				1055
				1056	int
				1057	pagebuf_ispin(
				1058	xfs_buf_t *pb)
				1059	{
				1060	return atomic_read(&pb->pb_pin_count);
				1061	}
				1062
				1063	/*
				1064	* pagebuf_wait_unpin
				1065	*
				1066	* pagebuf_wait_unpin waits until all of the memory associated
				1067	* with the buffer is not longer locked in memory. It returns
				1068	* immediately if none of the affected pages are locked.
				1069	*/
				1070	static inline void
				1071	_pagebuf_wait_unpin(
				1072	xfs_buf_t *pb)
				1073	{
				1074	DECLARE_WAITQUEUE (wait, current);
				1075
				1076	if (atomic_read(&pb->pb_pin_count) == 0)
				1077	return;
				1078
				1079	add_wait_queue(&pb->pb_waiters, &wait);
				1080	for (;;) {
				1081	set_current_state(TASK_UNINTERRUPTIBLE);
				1082	if (atomic_read(&pb->pb_pin_count) == 0)
				1083	break;
				1084	if (atomic_read(&pb->pb_io_remaining))
				1085	blk_run_address_space(pb->pb_target->pbr_mapping);
				1086	schedule();
				1087	}
				1088	remove_wait_queue(&pb->pb_waiters, &wait);
				1089	set_current_state(TASK_RUNNING);
				1090	}
				1091
				1092	/*
				1093	* Buffer Utility Routines
				1094	*/
				1095
				1096	/*
				1097	* pagebuf_iodone
				1098	*
				1099	* pagebuf_iodone marks a buffer for which I/O is in progress
				1100	* done with respect to that I/O. The pb_iodone routine, if
				1101	* present, will be called as a side-effect.
				1102	*/
				1103	STATIC void
				1104	pagebuf_iodone_work(
				1105	void *v)
				1106	{
				1107	xfs_buf_t bp = (xfs_buf_t )v;
				1108
				1109	if (bp->pb_iodone)
				1110	(*(bp->pb_iodone))(bp);
				1111	else if (bp->pb_flags & PBF_ASYNC)
				1112	xfs_buf_relse(bp);
				1113	}
				1114
				1115	void
				1116	pagebuf_iodone(
				1117	xfs_buf_t *pb,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1118	int schedule)
				1119	{
				1120	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE);
Christoph Hellwig	88741a9	2005-11-02 10:21:14 +1100	[diff] [blame]	1121	if (pb->pb_error == 0)
				1122	pb->pb_flags &= ~PBF_NONE;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1123
				1124	PB_TRACE(pb, "iodone", pb->pb_iodone);
				1125
				1126	if ((pb->pb_iodone) \|\| (pb->pb_flags & PBF_ASYNC)) {
				1127	if (schedule) {
				1128	INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
Christoph Hellwig	88741a9	2005-11-02 10:21:14 +1100	[diff] [blame]	1129	queue_work(xfslogd_workqueue, &pb->pb_iodone_work);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1130	} else {
				1131	pagebuf_iodone_work(pb);
				1132	}
				1133	} else {
				1134	up(&pb->pb_iodonesema);
				1135	}
				1136	}
				1137
				1138	/*
				1139	* pagebuf_ioerror
				1140	*
				1141	* pagebuf_ioerror sets the error code for a buffer.
				1142	*/
				1143	void
				1144	pagebuf_ioerror( /* mark/clear buffer error flag */
				1145	xfs_buf_t pb, / buffer to mark */
				1146	int error) /* error to store (0 if none) */
				1147	{
				1148	ASSERT(error >= 0 && error <= 0xffff);
				1149	pb->pb_error = (unsigned short)error;
				1150	PB_TRACE(pb, "ioerror", (unsigned long)error);
				1151	}
				1152
				1153	/*
				1154	* pagebuf_iostart
				1155	*
				1156	* pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
				1157	* If necessary, it will arrange for any disk space allocation required,
				1158	* and it will break up the request if the block mappings require it.
				1159	* The pb_iodone routine in the buffer supplied will only be called
				1160	* when all of the subsidiary I/O requests, if any, have been completed.
				1161	* pagebuf_iostart calls the pagebuf_ioinitiate routine or
				1162	* pagebuf_iorequest, if the former routine is not defined, to start
				1163	* the I/O on a given low-level request.
				1164	*/
				1165	int
				1166	pagebuf_iostart( /* start I/O on a buffer */
				1167	xfs_buf_t pb, / buffer to start */
				1168	page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
				1169	/* PBF_WRITE, PBF_DELWRI, */
				1170	/* PBF_DONT_BLOCK */
				1171	{
				1172	int status = 0;
				1173
				1174	PB_TRACE(pb, "iostart", (unsigned long)flags);
				1175
				1176	if (flags & PBF_DELWRI) {
				1177	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE \| PBF_ASYNC);
				1178	pb->pb_flags \|= flags & (PBF_DELWRI \| PBF_ASYNC);
				1179	pagebuf_delwri_queue(pb, 1);
				1180	return status;
				1181	}
				1182
				1183	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE \| PBF_ASYNC \| PBF_DELWRI \| \
				1184	PBF_READ_AHEAD \| _PBF_RUN_QUEUES);
				1185	pb->pb_flags \|= flags & (PBF_READ \| PBF_WRITE \| PBF_ASYNC \| \
				1186	PBF_READ_AHEAD \| _PBF_RUN_QUEUES);
				1187
				1188	BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
				1189
				1190	/* For writes allow an alternate strategy routine to precede
				1191	* the actual I/O request (which may not be issued at all in
				1192	* a shutdown situation, for example).
				1193	*/
				1194	status = (flags & PBF_WRITE) ?
				1195	pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
				1196
				1197	/* Wait for I/O if we are not an async request.
				1198	* Note: async I/O request completion will release the buffer,
				1199	* and that can already be done by this point. So using the
				1200	* buffer pointer from here on, after async I/O, is invalid.
				1201	*/
				1202	if (!status && !(flags & PBF_ASYNC))
				1203	status = pagebuf_iowait(pb);
				1204
				1205	return status;
				1206	}
				1207
				1208	/*
				1209	* Helper routine for pagebuf_iorequest
				1210	*/
				1211
				1212	STATIC __inline__ int
				1213	_pagebuf_iolocked(
				1214	xfs_buf_t *pb)
				1215	{
				1216	ASSERT(pb->pb_flags & (PBF_READ\|PBF_WRITE));
				1217	if (pb->pb_flags & PBF_READ)
				1218	return pb->pb_locked;
				1219	return 0;
				1220	}
				1221
				1222	STATIC __inline__ void
				1223	_pagebuf_iodone(
				1224	xfs_buf_t *pb,
				1225	int schedule)
				1226	{
				1227	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
				1228	pb->pb_locked = 0;
Christoph Hellwig	88741a9	2005-11-02 10:21:14 +1100	[diff] [blame]	1229	pagebuf_iodone(pb, schedule);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1230	}
				1231	}
				1232
				1233	STATIC int
				1234	bio_end_io_pagebuf(
				1235	struct bio *bio,
				1236	unsigned int bytes_done,
				1237	int error)
				1238	{
				1239	xfs_buf_t pb = (xfs_buf_t )bio->bi_private;
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1240	unsigned int blocksize = pb->pb_target->pbr_bsize;
				1241	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1242
				1243	if (bio->bi_size)
				1244	return 1;
				1245
				1246	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				1247	pb->pb_error = EIO;
				1248
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1249	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1250	struct page *page = bvec->bv_page;
				1251
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1252	if (unlikely(pb->pb_error)) {
				1253	if (pb->pb_flags & PBF_READ)
				1254	ClearPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1255	SetPageError(page);
				1256	} else if (blocksize == PAGE_CACHE_SIZE) {
				1257	SetPageUptodate(page);
				1258	} else if (!PagePrivate(page) &&
				1259	(pb->pb_flags & _PBF_PAGE_CACHE)) {
				1260	set_page_region(page, bvec->bv_offset, bvec->bv_len);
				1261	}
				1262
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1263	if (--bvec >= bio->bi_io_vec)
				1264	prefetchw(&bvec->bv_page->flags);
				1265
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1266	if (_pagebuf_iolocked(pb)) {
				1267	unlock_page(page);
				1268	}
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame]	1269	} while (bvec >= bio->bi_io_vec);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1270
				1271	_pagebuf_iodone(pb, 1);
				1272	bio_put(bio);
				1273	return 0;
				1274	}
				1275
				1276	STATIC void
				1277	_pagebuf_ioapply(
				1278	xfs_buf_t *pb)
				1279	{
				1280	int i, rw, map_i, total_nr_pages, nr_pages;
				1281	struct bio *bio;
				1282	int offset = pb->pb_offset;
				1283	int size = pb->pb_count_desired;
				1284	sector_t sector = pb->pb_bn;
				1285	unsigned int blocksize = pb->pb_target->pbr_bsize;
				1286	int locking = _pagebuf_iolocked(pb);
				1287
				1288	total_nr_pages = pb->pb_page_count;
				1289	map_i = 0;
				1290
				1291	if (pb->pb_flags & _PBF_RUN_QUEUES) {
				1292	pb->pb_flags &= ~_PBF_RUN_QUEUES;
				1293	rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC;
				1294	} else {
				1295	rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
				1296	}
				1297
Christoph Hellwig	f538d4d	2005-11-02 10:26:59 +1100	[diff] [blame^]	1298	if (pb->pb_flags & PBF_ORDERED) {
				1299	ASSERT(!(pb->pb_flags & PBF_READ));
				1300	rw = WRITE_BARRIER;
				1301	}
				1302
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1303	/* Special code path for reading a sub page size pagebuf in --
				1304	* we populate up the whole page, and hence the other metadata
				1305	* in the same page. This optimization is only valid when the
				1306	* filesystem block size and the page size are equal.
				1307	*/
				1308	if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
				1309	(pb->pb_flags & PBF_READ) && locking &&
				1310	(blocksize == PAGE_CACHE_SIZE)) {
				1311	bio = bio_alloc(GFP_NOIO, 1);
				1312
				1313	bio->bi_bdev = pb->pb_target->pbr_bdev;
				1314	bio->bi_sector = sector - (offset >> BBSHIFT);
				1315	bio->bi_end_io = bio_end_io_pagebuf;
				1316	bio->bi_private = pb;
				1317
				1318	bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
				1319	size = 0;
				1320
				1321	atomic_inc(&pb->pb_io_remaining);
				1322
				1323	goto submit_io;
				1324	}
				1325
				1326	/* Lock down the pages which we need to for the request */
				1327	if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
				1328	for (i = 0; size; i++) {
				1329	int nbytes = PAGE_CACHE_SIZE - offset;
				1330	struct page *page = pb->pb_pages[i];
				1331
				1332	if (nbytes > size)
				1333	nbytes = size;
				1334
				1335	lock_page(page);
				1336
				1337	size -= nbytes;
				1338	offset = 0;
				1339	}
				1340	offset = pb->pb_offset;
				1341	size = pb->pb_count_desired;
				1342	}
				1343
				1344	next_chunk:
				1345	atomic_inc(&pb->pb_io_remaining);
				1346	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
				1347	if (nr_pages > total_nr_pages)
				1348	nr_pages = total_nr_pages;
				1349
				1350	bio = bio_alloc(GFP_NOIO, nr_pages);
				1351	bio->bi_bdev = pb->pb_target->pbr_bdev;
				1352	bio->bi_sector = sector;
				1353	bio->bi_end_io = bio_end_io_pagebuf;
				1354	bio->bi_private = pb;
				1355
				1356	for (; size && nr_pages; nr_pages--, map_i++) {
				1357	int nbytes = PAGE_CACHE_SIZE - offset;
				1358
				1359	if (nbytes > size)
				1360	nbytes = size;
				1361
				1362	if (bio_add_page(bio, pb->pb_pages[map_i],
				1363	nbytes, offset) < nbytes)
				1364	break;
				1365
				1366	offset = 0;
				1367	sector += nbytes >> BBSHIFT;
				1368	size -= nbytes;
				1369	total_nr_pages--;
				1370	}
				1371
				1372	submit_io:
				1373	if (likely(bio->bi_size)) {
				1374	submit_bio(rw, bio);
				1375	if (size)
				1376	goto next_chunk;
				1377	} else {
				1378	bio_put(bio);
				1379	pagebuf_ioerror(pb, EIO);
				1380	}
				1381	}
				1382
				1383	/*
				1384	* pagebuf_iorequest -- the core I/O request routine.
				1385	*/
				1386	int
				1387	pagebuf_iorequest( /* start real I/O */
				1388	xfs_buf_t pb) / buffer to convey to device */
				1389	{
				1390	PB_TRACE(pb, "iorequest", 0);
				1391
				1392	if (pb->pb_flags & PBF_DELWRI) {
				1393	pagebuf_delwri_queue(pb, 1);
				1394	return 0;
				1395	}
				1396
				1397	if (pb->pb_flags & PBF_WRITE) {
				1398	_pagebuf_wait_unpin(pb);
				1399	}
				1400
				1401	pagebuf_hold(pb);
				1402
				1403	/* Set the count to 1 initially, this will stop an I/O
				1404	* completion callout which happens before we have started
				1405	* all the I/O from calling pagebuf_iodone too early.
				1406	*/
				1407	atomic_set(&pb->pb_io_remaining, 1);
				1408	_pagebuf_ioapply(pb);
				1409	_pagebuf_iodone(pb, 0);
				1410
				1411	pagebuf_rele(pb);
				1412	return 0;
				1413	}
				1414
				1415	/*
				1416	* pagebuf_iowait
				1417	*
				1418	* pagebuf_iowait waits for I/O to complete on the buffer supplied.
				1419	* It returns immediately if no I/O is pending. In any case, it returns
				1420	* the error code, if any, or 0 if there is no error.
				1421	*/
				1422	int
				1423	pagebuf_iowait(
				1424	xfs_buf_t *pb)
				1425	{
				1426	PB_TRACE(pb, "iowait", 0);
				1427	if (atomic_read(&pb->pb_io_remaining))
				1428	blk_run_address_space(pb->pb_target->pbr_mapping);
				1429	down(&pb->pb_iodonesema);
				1430	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
				1431	return pb->pb_error;
				1432	}
				1433
				1434	caddr_t
				1435	pagebuf_offset(
				1436	xfs_buf_t *pb,
				1437	size_t offset)
				1438	{
				1439	struct page *page;
				1440
				1441	offset += pb->pb_offset;
				1442
				1443	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
				1444	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
				1445	}
				1446
				1447	/*
				1448	* pagebuf_iomove
				1449	*
				1450	* Move data into or out of a buffer.
				1451	*/
				1452	void
				1453	pagebuf_iomove(
				1454	xfs_buf_t pb, / buffer to process */
				1455	size_t boff, /* starting buffer offset */
				1456	size_t bsize, /* length to copy */
				1457	caddr_t data, /* data address */
				1458	page_buf_rw_t mode) /* read/write flag */
				1459	{
				1460	size_t bend, cpoff, csize;
				1461	struct page *page;
				1462
				1463	bend = boff + bsize;
				1464	while (boff < bend) {
				1465	page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
				1466	cpoff = page_buf_poff(boff + pb->pb_offset);
				1467	csize = min_t(size_t,
				1468	PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
				1469
				1470	ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
				1471
				1472	switch (mode) {
				1473	case PBRW_ZERO:
				1474	memset(page_address(page) + cpoff, 0, csize);
				1475	break;
				1476	case PBRW_READ:
				1477	memcpy(data, page_address(page) + cpoff, csize);
				1478	break;
				1479	case PBRW_WRITE:
				1480	memcpy(page_address(page) + cpoff, data, csize);
				1481	}
				1482
				1483	boff += csize;
				1484	data += csize;
				1485	}
				1486	}
				1487
				1488	/*
				1489	* Handling of buftargs.
				1490	*/
				1491
				1492	/*
				1493	* Wait for any bufs with callbacks that have been submitted but
				1494	* have not yet returned... walk the hash list for the target.
				1495	*/
				1496	void
				1497	xfs_wait_buftarg(
				1498	xfs_buftarg_t *btp)
				1499	{
				1500	xfs_buf_t bp, n;
				1501	xfs_bufhash_t *hash;
				1502	uint i;
				1503
				1504	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
				1505	hash = &btp->bt_hash[i];
				1506	again:
				1507	spin_lock(&hash->bh_lock);
				1508	list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) {
				1509	ASSERT(btp == bp->pb_target);
				1510	if (!(bp->pb_flags & PBF_FS_MANAGED)) {
				1511	spin_unlock(&hash->bh_lock);
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1512	/*
				1513	* Catch superblock reference count leaks
				1514	* immediately
				1515	*/
				1516	BUG_ON(bp->pb_bn == 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1517	delay(100);
				1518	goto again;
				1519	}
				1520	}
				1521	spin_unlock(&hash->bh_lock);
				1522	}
				1523	}
				1524
				1525	/*
				1526	* Allocate buffer hash table for a given target.
				1527	* For devices containing metadata (i.e. not the log/realtime devices)
				1528	* we need to allocate a much larger hash table.
				1529	*/
				1530	STATIC void
				1531	xfs_alloc_bufhash(
				1532	xfs_buftarg_t *btp,
				1533	int external)
				1534	{
				1535	unsigned int i;
				1536
				1537	btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
				1538	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
				1539	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
				1540	sizeof(xfs_bufhash_t), KM_SLEEP);
				1541	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
				1542	spin_lock_init(&btp->bt_hash[i].bh_lock);
				1543	INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
				1544	}
				1545	}
				1546
				1547	STATIC void
				1548	xfs_free_bufhash(
				1549	xfs_buftarg_t *btp)
				1550	{
				1551	kmem_free(btp->bt_hash,
				1552	(1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
				1553	btp->bt_hash = NULL;
				1554	}
				1555
				1556	void
				1557	xfs_free_buftarg(
				1558	xfs_buftarg_t *btp,
				1559	int external)
				1560	{
				1561	xfs_flush_buftarg(btp, 1);
				1562	if (external)
				1563	xfs_blkdev_put(btp->pbr_bdev);
				1564	xfs_free_bufhash(btp);
				1565	iput(btp->pbr_mapping->host);
				1566	kmem_free(btp, sizeof(*btp));
				1567	}
				1568
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1569	STATIC int
				1570	xfs_setsize_buftarg_flags(
				1571	xfs_buftarg_t *btp,
				1572	unsigned int blocksize,
				1573	unsigned int sectorsize,
				1574	int verbose)
				1575	{
				1576	btp->pbr_bsize = blocksize;
				1577	btp->pbr_sshift = ffs(sectorsize) - 1;
				1578	btp->pbr_smask = sectorsize - 1;
				1579
				1580	if (set_blocksize(btp->pbr_bdev, sectorsize)) {
				1581	printk(KERN_WARNING
				1582	"XFS: Cannot set_blocksize to %u on device %s\n",
				1583	sectorsize, XFS_BUFTARG_NAME(btp));
				1584	return EINVAL;
				1585	}
				1586
				1587	if (verbose &&
				1588	(PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
				1589	printk(KERN_WARNING
				1590	"XFS: %u byte sectors in use on device %s. "
				1591	"This is suboptimal; %u or greater is ideal.\n",
				1592	sectorsize, XFS_BUFTARG_NAME(btp),
				1593	(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
				1594	}
				1595
				1596	return 0;
				1597	}
				1598
				1599	/*
				1600	* When allocating the initial buffer target we have not yet
				1601	* read in the superblock, so don't know what sized sectors
				1602	* are being used is at this early stage. Play safe.
				1603	*/
				1604	STATIC int
				1605	xfs_setsize_buftarg_early(
				1606	xfs_buftarg_t *btp,
				1607	struct block_device *bdev)
				1608	{
				1609	return xfs_setsize_buftarg_flags(btp,
				1610	PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
				1611	}
				1612
				1613	int
				1614	xfs_setsize_buftarg(
				1615	xfs_buftarg_t *btp,
				1616	unsigned int blocksize,
				1617	unsigned int sectorsize)
				1618	{
				1619	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
				1620	}
				1621
				1622	STATIC int
				1623	xfs_mapping_buftarg(
				1624	xfs_buftarg_t *btp,
				1625	struct block_device *bdev)
				1626	{
				1627	struct backing_dev_info *bdi;
				1628	struct inode *inode;
				1629	struct address_space *mapping;
				1630	static struct address_space_operations mapping_aops = {
				1631	.sync_page = block_sync_page,
				1632	};
				1633
				1634	inode = new_inode(bdev->bd_inode->i_sb);
				1635	if (!inode) {
				1636	printk(KERN_WARNING
				1637	"XFS: Cannot allocate mapping inode for device %s\n",
				1638	XFS_BUFTARG_NAME(btp));
				1639	return ENOMEM;
				1640	}
				1641	inode->i_mode = S_IFBLK;
				1642	inode->i_bdev = bdev;
				1643	inode->i_rdev = bdev->bd_dev;
				1644	bdi = blk_get_backing_dev_info(bdev);
				1645	if (!bdi)
				1646	bdi = &default_backing_dev_info;
				1647	mapping = &inode->i_data;
				1648	mapping->a_ops = &mapping_aops;
				1649	mapping->backing_dev_info = bdi;
				1650	mapping_set_gfp_mask(mapping, GFP_NOFS);
				1651	btp->pbr_mapping = mapping;
				1652	return 0;
				1653	}
				1654
				1655	xfs_buftarg_t *
				1656	xfs_alloc_buftarg(
				1657	struct block_device *bdev,
				1658	int external)
				1659	{
				1660	xfs_buftarg_t *btp;
				1661
				1662	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
				1663
				1664	btp->pbr_dev = bdev->bd_dev;
				1665	btp->pbr_bdev = bdev;
				1666	if (xfs_setsize_buftarg_early(btp, bdev))
				1667	goto error;
				1668	if (xfs_mapping_buftarg(btp, bdev))
				1669	goto error;
				1670	xfs_alloc_bufhash(btp, external);
				1671	return btp;
				1672
				1673	error:
				1674	kmem_free(btp, sizeof(*btp));
				1675	return NULL;
				1676	}
				1677
				1678
				1679	/*
				1680	* Pagebuf delayed write buffer handling
				1681	*/
				1682
				1683	STATIC LIST_HEAD(pbd_delwrite_queue);
				1684	STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);
				1685
				1686	STATIC void
				1687	pagebuf_delwri_queue(
				1688	xfs_buf_t *pb,
				1689	int unlock)
				1690	{
				1691	PB_TRACE(pb, "delwri_q", (long)unlock);
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1692	ASSERT((pb->pb_flags & (PBF_DELWRI\|PBF_ASYNC)) ==
				1693	(PBF_DELWRI\|PBF_ASYNC));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1694
				1695	spin_lock(&pbd_delwrite_lock);
				1696	/* If already in the queue, dequeue and place at tail */
				1697	if (!list_empty(&pb->pb_list)) {
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1698	ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1699	if (unlock) {
				1700	atomic_dec(&pb->pb_hold);
				1701	}
				1702	list_del(&pb->pb_list);
				1703	}
				1704
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1705	pb->pb_flags \|= _PBF_DELWRI_Q;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1706	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
				1707	pb->pb_queuetime = jiffies;
				1708	spin_unlock(&pbd_delwrite_lock);
				1709
				1710	if (unlock)
				1711	pagebuf_unlock(pb);
				1712	}
				1713
				1714	void
				1715	pagebuf_delwri_dequeue(
				1716	xfs_buf_t *pb)
				1717	{
				1718	int dequeued = 0;
				1719
				1720	spin_lock(&pbd_delwrite_lock);
				1721	if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1722	ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1723	list_del_init(&pb->pb_list);
				1724	dequeued = 1;
				1725	}
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1726	pb->pb_flags &= ~(PBF_DELWRI\|_PBF_DELWRI_Q);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1727	spin_unlock(&pbd_delwrite_lock);
				1728
				1729	if (dequeued)
				1730	pagebuf_rele(pb);
				1731
				1732	PB_TRACE(pb, "delwri_dq", (long)dequeued);
				1733	}
				1734
				1735	STATIC void
				1736	pagebuf_runall_queues(
				1737	struct workqueue_struct *queue)
				1738	{
				1739	flush_workqueue(queue);
				1740	}
				1741
				1742	/* Defines for pagebuf daemon */
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1743	STATIC struct task_struct *xfsbufd_task;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1744	STATIC int xfsbufd_force_flush;
				1745	STATIC int xfsbufd_force_sleep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1746
				1747	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1748	xfsbufd_wakeup(
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	1749	int priority,
				1750	gfp_t mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1751	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1752	if (xfsbufd_force_sleep)
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1753	return 0;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1754	xfsbufd_force_flush = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1755	barrier();
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1756	wake_up_process(xfsbufd_task);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1757	return 0;
				1758	}
				1759
				1760	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1761	xfsbufd(
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1762	void *data)
				1763	{
				1764	struct list_head tmp;
				1765	unsigned long age;
				1766	xfs_buftarg_t *target;
				1767	xfs_buf_t pb, n;
				1768
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1769	current->flags \|= PF_MEMALLOC;
				1770
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1771	INIT_LIST_HEAD(&tmp);
				1772	do {
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame]	1773	if (unlikely(freezing(current))) {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1774	xfsbufd_force_sleep = 1;
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame]	1775	refrigerator();
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1776	} else {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1777	xfsbufd_force_sleep = 0;
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1778	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1779
Nishanth Aravamudan	041e0e3	2005-09-10 00:27:23 -0700	[diff] [blame]	1780	schedule_timeout_interruptible
				1781	(xfs_buf_timer_centisecs * msecs_to_jiffies(10));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1782
Nishanth Aravamudan	041e0e3	2005-09-10 00:27:23 -0700	[diff] [blame]	1783	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1784	spin_lock(&pbd_delwrite_lock);
				1785	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
				1786	PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
				1787	ASSERT(pb->pb_flags & PBF_DELWRI);
				1788
				1789	if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1790	if (!xfsbufd_force_flush &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1791	time_before(jiffies,
				1792	pb->pb_queuetime + age)) {
				1793	pagebuf_unlock(pb);
				1794	break;
				1795	}
				1796
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1797	pb->pb_flags &= ~(PBF_DELWRI\|_PBF_DELWRI_Q);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1798	pb->pb_flags \|= PBF_WRITE;
				1799	list_move(&pb->pb_list, &tmp);
				1800	}
				1801	}
				1802	spin_unlock(&pbd_delwrite_lock);
				1803
				1804	while (!list_empty(&tmp)) {
				1805	pb = list_entry(tmp.next, xfs_buf_t, pb_list);
				1806	target = pb->pb_target;
				1807
				1808	list_del_init(&pb->pb_list);
				1809	pagebuf_iostrategy(pb);
				1810
				1811	blk_run_address_space(target->pbr_mapping);
				1812	}
				1813
				1814	if (as_list_len > 0)
				1815	purge_addresses();
				1816
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1817	xfsbufd_force_flush = 0;
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1818	} while (!kthread_should_stop());
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1819
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1820	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1821	}
				1822
				1823	/*
				1824	* Go through all incore buffers, and release buffers if they belong to
				1825	* the given device. This is used in filesystem error handling to
				1826	* preserve the consistency of its metadata.
				1827	*/
				1828	int
				1829	xfs_flush_buftarg(
				1830	xfs_buftarg_t *target,
				1831	int wait)
				1832	{
				1833	struct list_head tmp;
				1834	xfs_buf_t pb, n;
				1835	int pincount = 0;
				1836
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1837	pagebuf_runall_queues(xfsdatad_workqueue);
				1838	pagebuf_runall_queues(xfslogd_workqueue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1839
				1840	INIT_LIST_HEAD(&tmp);
				1841	spin_lock(&pbd_delwrite_lock);
				1842	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
				1843
				1844	if (pb->pb_target != target)
				1845	continue;
				1846
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1847	ASSERT(pb->pb_flags & (PBF_DELWRI\|_PBF_DELWRI_Q));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1848	PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
				1849	if (pagebuf_ispin(pb)) {
				1850	pincount++;
				1851	continue;
				1852	}
				1853
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1854	list_move(&pb->pb_list, &tmp);
				1855	}
				1856	spin_unlock(&pbd_delwrite_lock);
				1857
				1858	/*
				1859	* Dropped the delayed write list lock, now walk the temporary list
				1860	*/
				1861	list_for_each_entry_safe(pb, n, &tmp, pb_list) {
David Chinner	2f92658	2005-09-05 08:33:35 +1000	[diff] [blame]	1862	pagebuf_lock(pb);
				1863	pb->pb_flags &= ~(PBF_DELWRI\|_PBF_DELWRI_Q);
				1864	pb->pb_flags \|= PBF_WRITE;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1865	if (wait)
				1866	pb->pb_flags &= ~PBF_ASYNC;
				1867	else
				1868	list_del_init(&pb->pb_list);
				1869
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1870	pagebuf_iostrategy(pb);
				1871	}
				1872
				1873	/*
				1874	* Remaining list items must be flushed before returning
				1875	*/
				1876	while (!list_empty(&tmp)) {
				1877	pb = list_entry(tmp.next, xfs_buf_t, pb_list);
				1878
				1879	list_del_init(&pb->pb_list);
				1880	xfs_iowait(pb);
				1881	xfs_buf_relse(pb);
				1882	}
				1883
				1884	if (wait)
				1885	blk_run_address_space(target->pbr_mapping);
				1886
				1887	return pincount;
				1888	}
				1889
Christoph Hellwig	04d8b28	2005-11-02 10:15:05 +1100	[diff] [blame]	1890	int __init
				1891	pagebuf_init(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1892	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1893	int error = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1894
Christoph Hellwig	04d8b28	2005-11-02 10:15:05 +1100	[diff] [blame]	1895	#ifdef PAGEBUF_TRACE
				1896	pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
				1897	#endif
				1898
				1899	pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
				1900	if (!pagebuf_zone)
				1901	goto out_free_trace_buf;
				1902
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1903	xfslogd_workqueue = create_workqueue("xfslogd");
				1904	if (!xfslogd_workqueue)
Christoph Hellwig	04d8b28	2005-11-02 10:15:05 +1100	[diff] [blame]	1905	goto out_free_buf_zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1906
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1907	xfsdatad_workqueue = create_workqueue("xfsdatad");
				1908	if (!xfsdatad_workqueue)
				1909	goto out_destroy_xfslogd_workqueue;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1910
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1911	xfsbufd_task = kthread_run(xfsbufd, NULL, "xfsbufd");
				1912	if (IS_ERR(xfsbufd_task)) {
				1913	error = PTR_ERR(xfsbufd_task);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1914	goto out_destroy_xfsdatad_workqueue;
Christoph Hellwig	4df08c5	2005-09-05 08:34:18 +1000	[diff] [blame]	1915	}
Christoph Hellwig	04d8b28	2005-11-02 10:15:05 +1100	[diff] [blame]	1916
				1917	pagebuf_shake = kmem_shake_register(xfsbufd_wakeup);
				1918	if (!pagebuf_shake)
				1919	goto out_stop_xfsbufd;
				1920
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1921	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1922
Christoph Hellwig	04d8b28	2005-11-02 10:15:05 +1100	[diff] [blame]	1923	out_stop_xfsbufd:
				1924	kthread_stop(xfsbufd_task);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1925	out_destroy_xfsdatad_workqueue:
				1926	destroy_workqueue(xfsdatad_workqueue);
				1927	out_destroy_xfslogd_workqueue:
				1928	destroy_workqueue(xfslogd_workqueue);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1929	out_free_buf_zone:
Christoph Hellwig	04d8b28	2005-11-02 10:15:05 +1100	[diff] [blame]	1930	kmem_zone_destroy(pagebuf_zone);
				1931	out_free_trace_buf:
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1932	#ifdef PAGEBUF_TRACE
				1933	ktrace_free(pagebuf_trace_buf);
				1934	#endif
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1935	return error;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1936	}
				1937
/*
 * Tear down everything pagebuf_init() set up, in the reverse order of
 * its creation: the shaker callback, the xfsbufd thread, the xfsdatad
 * and xfslogd workqueues, the xfs_buf_t zone and, when PAGEBUF_TRACE is
 * defined, the trace buffer.
 */
void
pagebuf_terminate(void)
{
	/* Deregister the shaker before stopping the thread it wakes up */
	kmem_shake_deregister(pagebuf_shake);
	kthread_stop(xfsbufd_task);
	destroy_workqueue(xfsdatad_workqueue);
	destroy_workqueue(xfslogd_workqueue);
	kmem_zone_destroy(pagebuf_zone);
#ifdef PAGEBUF_TRACE
	ktrace_free(pagebuf_trace_buf);
#endif
}