Blame - fs/xfs/linux-2.6/xfs_buf.c - kernel/msm-4.9

blob: 58286b1d733b9541c7395e78a404552a6e8022d4 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame^]	2	* Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* This program is free software; you can redistribute it and/or modify it
				5	* under the terms of version 2 of the GNU General Public License as
				6	* published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it would be useful, but
				9	* WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				11	*
				12	* Further, this software is distributed without any warranty that it is
				13	* free of the rightful claim of any third person regarding infringement
				14	* or the like. Any license provided herein, whether implied or
				15	* otherwise, applies only to this software file. Patent licenses, if
				16	* any, provided herein do not apply to combinations of this program with
				17	* other software, or any other product whatsoever.
				18	*
				19	* You should have received a copy of the GNU General Public License along
				20	* with this program; if not, write the Free Software Foundation, Inc., 59
				21	* Temple Place - Suite 330, Boston MA 02111-1307, USA.
				22	*
				23	* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
				24	* Mountain View, CA 94043, or:
				25	*
				26	* http://www.sgi.com
				27	*
				28	* For further information regarding this notice, see:
				29	*
				30	* http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
				31	*/
				32
				33	/*
				34	* The xfs_buf.c code provides an abstract buffer cache model on top
				35	* of the Linux page cache. Cached metadata blocks for a file system
				36	* are hashed to the inode for the block device. xfs_buf.c assembles
				37	* buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
				38	*
				39	* Written by Steve Lord, Jim Mostek, Russell Cattelan
				40	* and Rajagopal Ananthanarayanan ("ananth") at SGI.
				41	*
				42	*/
				43
				44	#include <linux/stddef.h>
				45	#include <linux/errno.h>
				46	#include <linux/slab.h>
				47	#include <linux/pagemap.h>
				48	#include <linux/init.h>
				49	#include <linux/vmalloc.h>
				50	#include <linux/bio.h>
				51	#include <linux/sysctl.h>
				52	#include <linux/proc_fs.h>
				53	#include <linux/workqueue.h>
				54	#include <linux/percpu.h>
				55	#include <linux/blkdev.h>
				56	#include <linux/hash.h>
				57
				58	#include "xfs_linux.h"
				59
				60	/*
				61	* File wide globals
				62	*/
				63
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	64	STATIC kmem_cache_t *pagebuf_zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	65	STATIC kmem_shaker_t pagebuf_shake;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	66	STATIC int xfsbufd_wakeup(int, unsigned int);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	67	STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	68
				69	STATIC struct workqueue_struct *xfslogd_workqueue;
				70	STATIC struct workqueue_struct *xfsdatad_workqueue;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	71
				72	/*
				73	* Pagebuf debugging
				74	*/
				75
				76	#ifdef PAGEBUF_TRACE
				77	void
				78	pagebuf_trace(
				79	xfs_buf_t *pb,
				80	char *id,
				81	void *data,
				82	void *ra)
				83	{
				84	ktrace_enter(pagebuf_trace_buf,
				85	pb, id,
				86	(void *)(unsigned long)pb->pb_flags,
				87	(void *)(unsigned long)pb->pb_hold.counter,
				88	(void *)(unsigned long)pb->pb_sema.count.counter,
				89	(void *)current,
				90	data, ra,
				91	(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
				92	(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
				93	(void *)(unsigned long)pb->pb_buffer_length,
				94	NULL, NULL, NULL, NULL, NULL);
				95	}
				96	ktrace_t *pagebuf_trace_buf;
				97	#define PAGEBUF_TRACE_SIZE 4096
				98	#define PB_TRACE(pb, id, data) \
				99	pagebuf_trace(pb, id, (void )data, (void )__builtin_return_address(0))
				100	#else
				101	#define PB_TRACE(pb, id, data) do { } while (0)
				102	#endif
				103
				104	#ifdef PAGEBUF_LOCK_TRACKING
				105	# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
				106	# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
				107	# define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
				108	#else
				109	# define PB_SET_OWNER(pb) do { } while (0)
				110	# define PB_CLEAR_OWNER(pb) do { } while (0)
				111	# define PB_GET_OWNER(pb) do { } while (0)
				112	#endif
				113
				114	/*
				115	* Pagebuf allocation / freeing.
				116	*/
				117
				118	#define pb_to_gfp(flags) \
				119	((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
				120	((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) \| __GFP_NOWARN)
				121
				122	#define pb_to_km(flags) \
				123	(((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
				124
				125
				126	#define pagebuf_allocate(flags) \
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	127	kmem_zone_alloc(pagebuf_zone, pb_to_km(flags))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	128	#define pagebuf_deallocate(pb) \
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	129	kmem_zone_free(pagebuf_zone, (pb));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	130
				131	/*
				132	* Page Region interfaces.
				133	*
				134	* For pages in filesystems where the blocksize is smaller than the
				135	* pagesize, we use the page->private field (long) to hold a bitmap
				136	* of uptodate regions within the page.
				137	*
				138	* Each such region is "bytes per page / bits per long" bytes long.
				139	*
				140	* NBPPR == number-of-bytes-per-page-region
				141	* BTOPR == bytes-to-page-region (rounded up)
				142	* BTOPRT == bytes-to-page-region-truncated (rounded down)
				143	*/
				144	#if (BITS_PER_LONG == 32)
				145	#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
				146	#elif (BITS_PER_LONG == 64)
				147	#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
				148	#else
				149	#error BITS_PER_LONG must be 32 or 64
				150	#endif
				151	#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
				152	#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
				153	#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
				154
				155	STATIC unsigned long
				156	page_region_mask(
				157	size_t offset,
				158	size_t length)
				159	{
				160	unsigned long mask;
				161	int first, final;
				162
				163	first = BTOPR(offset);
				164	final = BTOPRT(offset + length - 1);
				165	first = min(first, final);
				166
				167	mask = ~0UL;
				168	mask <<= BITS_PER_LONG - (final - first);
				169	mask >>= BITS_PER_LONG - (final);
				170
				171	ASSERT(offset + length <= PAGE_CACHE_SIZE);
				172	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
				173
				174	return mask;
				175	}
				176
				177	STATIC inline void
				178	set_page_region(
				179	struct page *page,
				180	size_t offset,
				181	size_t length)
				182	{
				183	page->private \|= page_region_mask(offset, length);
				184	if (page->private == ~0UL)
				185	SetPageUptodate(page);
				186	}
				187
				188	STATIC inline int
				189	test_page_region(
				190	struct page *page,
				191	size_t offset,
				192	size_t length)
				193	{
				194	unsigned long mask = page_region_mask(offset, length);
				195
				196	return (mask && (page->private & mask) == mask);
				197	}
				198
				199	/*
				200	* Mapping of multi-page buffers into contiguous virtual space
				201	*/
				202
				203	typedef struct a_list {
				204	void *vm_addr;
				205	struct a_list *next;
				206	} a_list_t;
				207
				208	STATIC a_list_t *as_free_head;
				209	STATIC int as_list_len;
				210	STATIC DEFINE_SPINLOCK(as_lock);
				211
				212	/*
				213	* Try to batch vunmaps because they are costly.
				214	*/
				215	STATIC void
				216	free_address(
				217	void *addr)
				218	{
				219	a_list_t *aentry;
				220
				221	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
				222	if (likely(aentry)) {
				223	spin_lock(&as_lock);
				224	aentry->next = as_free_head;
				225	aentry->vm_addr = addr;
				226	as_free_head = aentry;
				227	as_list_len++;
				228	spin_unlock(&as_lock);
				229	} else {
				230	vunmap(addr);
				231	}
				232	}
				233
				234	STATIC void
				235	purge_addresses(void)
				236	{
				237	a_list_t aentry, old;
				238
				239	if (as_free_head == NULL)
				240	return;
				241
				242	spin_lock(&as_lock);
				243	aentry = as_free_head;
				244	as_free_head = NULL;
				245	as_list_len = 0;
				246	spin_unlock(&as_lock);
				247
				248	while ((old = aentry) != NULL) {
				249	vunmap(aentry->vm_addr);
				250	aentry = aentry->next;
				251	kfree(old);
				252	}
				253	}
				254
				255	/*
				256	* Internal pagebuf object manipulation
				257	*/
				258
				259	STATIC void
				260	_pagebuf_initialize(
				261	xfs_buf_t *pb,
				262	xfs_buftarg_t *target,
				263	loff_t range_base,
				264	size_t range_length,
				265	page_buf_flags_t flags)
				266	{
				267	/*
				268	* We don't want certain flags to appear in pb->pb_flags.
				269	*/
				270	flags &= ~(PBF_LOCK\|PBF_MAPPED\|PBF_DONT_BLOCK\|PBF_READ_AHEAD);
				271
				272	memset(pb, 0, sizeof(xfs_buf_t));
				273	atomic_set(&pb->pb_hold, 1);
				274	init_MUTEX_LOCKED(&pb->pb_iodonesema);
				275	INIT_LIST_HEAD(&pb->pb_list);
				276	INIT_LIST_HEAD(&pb->pb_hash_list);
				277	init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
				278	PB_SET_OWNER(pb);
				279	pb->pb_target = target;
				280	pb->pb_file_offset = range_base;
				281	/*
				282	* Set buffer_length and count_desired to the same value initially.
				283	* I/O routines should use count_desired, which will be the same in
				284	* most cases but may be reset (e.g. XFS recovery).
				285	*/
				286	pb->pb_buffer_length = pb->pb_count_desired = range_length;
				287	pb->pb_flags = flags \| PBF_NONE;
				288	pb->pb_bn = XFS_BUF_DADDR_NULL;
				289	atomic_set(&pb->pb_pin_count, 0);
				290	init_waitqueue_head(&pb->pb_waiters);
				291
				292	XFS_STATS_INC(pb_create);
				293	PB_TRACE(pb, "initialize", target);
				294	}
				295
				296	/*
				297	* Allocate a page array capable of holding a specified number
				298	* of pages, and point the page buf at it.
				299	*/
				300	STATIC int
				301	_pagebuf_get_pages(
				302	xfs_buf_t *pb,
				303	int page_count,
				304	page_buf_flags_t flags)
				305	{
				306	/* Make sure that we have a page list */
				307	if (pb->pb_pages == NULL) {
				308	pb->pb_offset = page_buf_poff(pb->pb_file_offset);
				309	pb->pb_page_count = page_count;
				310	if (page_count <= PB_PAGES) {
				311	pb->pb_pages = pb->pb_page_array;
				312	} else {
				313	pb->pb_pages = kmem_alloc(sizeof(struct page )
				314	page_count, pb_to_km(flags));
				315	if (pb->pb_pages == NULL)
				316	return -ENOMEM;
				317	}
				318	memset(pb->pb_pages, 0, sizeof(struct page ) page_count);
				319	}
				320	return 0;
				321	}
				322
				323	/*
				324	* Frees pb_pages if it was malloced.
				325	*/
				326	STATIC void
				327	_pagebuf_free_pages(
				328	xfs_buf_t *bp)
				329	{
				330	if (bp->pb_pages != bp->pb_page_array) {
				331	kmem_free(bp->pb_pages,
				332	bp->pb_page_count * sizeof(struct page *));
				333	}
				334	}
				335
				336	/*
				337	* Releases the specified buffer.
				338	*
				339	* The modification state of any associated pages is left unchanged.
				340	* The buffer most not be on any hash - use pagebuf_rele instead for
				341	* hashed and refcounted buffers
				342	*/
				343	void
				344	pagebuf_free(
				345	xfs_buf_t *bp)
				346	{
				347	PB_TRACE(bp, "free", 0);
				348
				349	ASSERT(list_empty(&bp->pb_hash_list));
				350
				351	if (bp->pb_flags & _PBF_PAGE_CACHE) {
				352	uint i;
				353
				354	if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
				355	free_address(bp->pb_addr - bp->pb_offset);
				356
				357	for (i = 0; i < bp->pb_page_count; i++)
				358	page_cache_release(bp->pb_pages[i]);
				359	_pagebuf_free_pages(bp);
				360	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
				361	/*
				362	* XXX(hch): bp->pb_count_desired might be incorrect (see
				363	* pagebuf_associate_memory for details), but fortunately
				364	* the Linux version of kmem_free ignores the len argument..
				365	*/
				366	kmem_free(bp->pb_addr, bp->pb_count_desired);
				367	_pagebuf_free_pages(bp);
				368	}
				369
				370	pagebuf_deallocate(bp);
				371	}
				372
				373	/*
				374	* Finds all pages for buffer in question and builds it's page list.
				375	*/
				376	STATIC int
				377	_pagebuf_lookup_pages(
				378	xfs_buf_t *bp,
				379	uint flags)
				380	{
				381	struct address_space *mapping = bp->pb_target->pbr_mapping;
				382	size_t blocksize = bp->pb_target->pbr_bsize;
				383	size_t size = bp->pb_count_desired;
				384	size_t nbytes, offset;
				385	int gfp_mask = pb_to_gfp(flags);
				386	unsigned short page_count, i;
				387	pgoff_t first;
				388	loff_t end;
				389	int error;
				390
				391	end = bp->pb_file_offset + bp->pb_buffer_length;
				392	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
				393
				394	error = _pagebuf_get_pages(bp, page_count, flags);
				395	if (unlikely(error))
				396	return error;
				397	bp->pb_flags \|= _PBF_PAGE_CACHE;
				398
				399	offset = bp->pb_offset;
				400	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
				401
				402	for (i = 0; i < bp->pb_page_count; i++) {
				403	struct page *page;
				404	uint retries = 0;
				405
				406	retry:
				407	page = find_or_create_page(mapping, first + i, gfp_mask);
				408	if (unlikely(page == NULL)) {
				409	if (flags & PBF_READ_AHEAD) {
				410	bp->pb_page_count = i;
				411	for (i = 0; i < bp->pb_page_count; i++)
				412	unlock_page(bp->pb_pages[i]);
				413	return -ENOMEM;
				414	}
				415
				416	/*
				417	* This could deadlock.
				418	*
				419	* But until all the XFS lowlevel code is revamped to
				420	* handle buffer allocation failures we can't do much.
				421	*/
				422	if (!(++retries % 100))
				423	printk(KERN_ERR
				424	"XFS: possible memory allocation "
				425	"deadlock in %s (mode:0x%x)\n",
				426	__FUNCTION__, gfp_mask);
				427
				428	XFS_STATS_INC(pb_page_retries);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	429	xfsbufd_wakeup(0, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	430	blk_congestion_wait(WRITE, HZ/50);
				431	goto retry;
				432	}
				433
				434	XFS_STATS_INC(pb_page_found);
				435
				436	nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
				437	size -= nbytes;
				438
				439	if (!PageUptodate(page)) {
				440	page_count--;
				441	if (blocksize >= PAGE_CACHE_SIZE) {
				442	if (flags & PBF_READ)
				443	bp->pb_locked = 1;
				444	} else if (!PagePrivate(page)) {
				445	if (test_page_region(page, offset, nbytes))
				446	page_count++;
				447	}
				448	}
				449
				450	bp->pb_pages[i] = page;
				451	offset = 0;
				452	}
				453
				454	if (!bp->pb_locked) {
				455	for (i = 0; i < bp->pb_page_count; i++)
				456	unlock_page(bp->pb_pages[i]);
				457	}
				458
				459	if (page_count) {
				460	/* if we have any uptodate pages, mark that in the buffer */
				461	bp->pb_flags &= ~PBF_NONE;
				462
				463	/* if some pages aren't uptodate, mark that in the buffer */
				464	if (page_count != bp->pb_page_count)
				465	bp->pb_flags \|= PBF_PARTIAL;
				466	}
				467
				468	PB_TRACE(bp, "lookup_pages", (long)page_count);
				469	return error;
				470	}
				471
				472	/*
				473	* Map buffer into kernel address-space if nessecary.
				474	*/
				475	STATIC int
				476	_pagebuf_map_pages(
				477	xfs_buf_t *bp,
				478	uint flags)
				479	{
				480	/* A single page buffer is always mappable */
				481	if (bp->pb_page_count == 1) {
				482	bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
				483	bp->pb_flags \|= PBF_MAPPED;
				484	} else if (flags & PBF_MAPPED) {
				485	if (as_list_len > 64)
				486	purge_addresses();
				487	bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				488	VM_MAP, PAGE_KERNEL);
				489	if (unlikely(bp->pb_addr == NULL))
				490	return -ENOMEM;
				491	bp->pb_addr += bp->pb_offset;
				492	bp->pb_flags \|= PBF_MAPPED;
				493	}
				494
				495	return 0;
				496	}
				497
				498	/*
				499	* Finding and Reading Buffers
				500	*/
				501
				502	/*
				503	* _pagebuf_find
				504	*
				505	* Looks up, and creates if absent, a lockable buffer for
				506	* a given range of an inode. The buffer is returned
				507	* locked. If other overlapping buffers exist, they are
				508	* released before the new buffer is created and locked,
				509	* which may imply that this call will block until those buffers
				510	* are unlocked. No I/O is implied by this call.
				511	*/
				512	xfs_buf_t *
				513	_pagebuf_find(
				514	xfs_buftarg_t btp, / block device target */
				515	loff_t ioff, /* starting offset of range */
				516	size_t isize, /* length of range */
				517	page_buf_flags_t flags, /* PBF_TRYLOCK */
				518	xfs_buf_t new_pb)/ newly allocated buffer */
				519	{
				520	loff_t range_base;
				521	size_t range_length;
				522	xfs_bufhash_t *hash;
				523	xfs_buf_t pb, n;
				524
				525	range_base = (ioff << BBSHIFT);
				526	range_length = (isize << BBSHIFT);
				527
				528	/* Check for IOs smaller than the sector size / not sector aligned */
				529	ASSERT(!(range_length < (1 << btp->pbr_sshift)));
				530	ASSERT(!(range_base & (loff_t)btp->pbr_smask));
				531
				532	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
				533
				534	spin_lock(&hash->bh_lock);
				535
				536	list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
				537	ASSERT(btp == pb->pb_target);
				538	if (pb->pb_file_offset == range_base &&
				539	pb->pb_buffer_length == range_length) {
				540	/*
				541	* If we look at something bring it to the
				542	* front of the list for next time.
				543	*/
				544	atomic_inc(&pb->pb_hold);
				545	list_move(&pb->pb_hash_list, &hash->bh_list);
				546	goto found;
				547	}
				548	}
				549
				550	/* No match found */
				551	if (new_pb) {
				552	_pagebuf_initialize(new_pb, btp, range_base,
				553	range_length, flags);
				554	new_pb->pb_hash = hash;
				555	list_add(&new_pb->pb_hash_list, &hash->bh_list);
				556	} else {
				557	XFS_STATS_INC(pb_miss_locked);
				558	}
				559
				560	spin_unlock(&hash->bh_lock);
				561	return new_pb;
				562
				563	found:
				564	spin_unlock(&hash->bh_lock);
				565
				566	/* Attempt to get the semaphore without sleeping,
				567	* if this does not work then we need to drop the
				568	* spinlock and do a hard attempt on the semaphore.
				569	*/
				570	if (down_trylock(&pb->pb_sema)) {
				571	if (!(flags & PBF_TRYLOCK)) {
				572	/* wait for buffer ownership */
				573	PB_TRACE(pb, "get_lock", 0);
				574	pagebuf_lock(pb);
				575	XFS_STATS_INC(pb_get_locked_waited);
				576	} else {
				577	/* We asked for a trylock and failed, no need
				578	* to look at file offset and length here, we
				579	* know that this pagebuf at least overlaps our
				580	* pagebuf and is locked, therefore our buffer
				581	* either does not exist, or is this buffer
				582	*/
				583
				584	pagebuf_rele(pb);
				585	XFS_STATS_INC(pb_busy_locked);
				586	return (NULL);
				587	}
				588	} else {
				589	/* trylock worked */
				590	PB_SET_OWNER(pb);
				591	}
				592
				593	if (pb->pb_flags & PBF_STALE)
				594	pb->pb_flags &= PBF_MAPPED;
				595	PB_TRACE(pb, "got_lock", 0);
				596	XFS_STATS_INC(pb_get_locked);
				597	return (pb);
				598	}
				599
				600	/*
				601	* xfs_buf_get_flags assembles a buffer covering the specified range.
				602	*
				603	* Storage in memory for all portions of the buffer will be allocated,
				604	* although backing storage may not be.
				605	*/
				606	xfs_buf_t *
				607	xfs_buf_get_flags( /* allocate a buffer */
				608	xfs_buftarg_t target,/ target for buffer */
				609	loff_t ioff, /* starting offset of range */
				610	size_t isize, /* length of range */
				611	page_buf_flags_t flags) /* PBF_TRYLOCK */
				612	{
				613	xfs_buf_t pb, new_pb;
				614	int error = 0, i;
				615
				616	new_pb = pagebuf_allocate(flags);
				617	if (unlikely(!new_pb))
				618	return NULL;
				619
				620	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
				621	if (pb == new_pb) {
				622	error = _pagebuf_lookup_pages(pb, flags);
				623	if (error)
				624	goto no_buffer;
				625	} else {
				626	pagebuf_deallocate(new_pb);
				627	if (unlikely(pb == NULL))
				628	return NULL;
				629	}
				630
				631	for (i = 0; i < pb->pb_page_count; i++)
				632	mark_page_accessed(pb->pb_pages[i]);
				633
				634	if (!(pb->pb_flags & PBF_MAPPED)) {
				635	error = _pagebuf_map_pages(pb, flags);
				636	if (unlikely(error)) {
				637	printk(KERN_WARNING "%s: failed to map pages\n",
				638	__FUNCTION__);
				639	goto no_buffer;
				640	}
				641	}
				642
				643	XFS_STATS_INC(pb_get);
				644
				645	/*
				646	* Always fill in the block number now, the mapped cases can do
				647	* their own overlay of this later.
				648	*/
				649	pb->pb_bn = ioff;
				650	pb->pb_count_desired = pb->pb_buffer_length;
				651
				652	PB_TRACE(pb, "get", (unsigned long)flags);
				653	return pb;
				654
				655	no_buffer:
				656	if (flags & (PBF_LOCK \| PBF_TRYLOCK))
				657	pagebuf_unlock(pb);
				658	pagebuf_rele(pb);
				659	return NULL;
				660	}
				661
				662	xfs_buf_t *
				663	xfs_buf_read_flags(
				664	xfs_buftarg_t *target,
				665	loff_t ioff,
				666	size_t isize,
				667	page_buf_flags_t flags)
				668	{
				669	xfs_buf_t *pb;
				670
				671	flags \|= PBF_READ;
				672
				673	pb = xfs_buf_get_flags(target, ioff, isize, flags);
				674	if (pb) {
				675	if (PBF_NOT_DONE(pb)) {
				676	PB_TRACE(pb, "read", (unsigned long)flags);
				677	XFS_STATS_INC(pb_get_read);
				678	pagebuf_iostart(pb, flags);
				679	} else if (flags & PBF_ASYNC) {
				680	PB_TRACE(pb, "read_async", (unsigned long)flags);
				681	/*
				682	* Read ahead call which is already satisfied,
				683	* drop the buffer
				684	*/
				685	goto no_buffer;
				686	} else {
				687	PB_TRACE(pb, "read_done", (unsigned long)flags);
				688	/* We do not want read in the flags */
				689	pb->pb_flags &= ~PBF_READ;
				690	}
				691	}
				692
				693	return pb;
				694
				695	no_buffer:
				696	if (flags & (PBF_LOCK \| PBF_TRYLOCK))
				697	pagebuf_unlock(pb);
				698	pagebuf_rele(pb);
				699	return NULL;
				700	}
				701
				702	/*
				703	* Create a skeletal pagebuf (no pages associated with it).
				704	*/
				705	xfs_buf_t *
				706	pagebuf_lookup(
				707	xfs_buftarg_t *target,
				708	loff_t ioff,
				709	size_t isize,
				710	page_buf_flags_t flags)
				711	{
				712	xfs_buf_t *pb;
				713
				714	pb = pagebuf_allocate(flags);
				715	if (pb) {
				716	_pagebuf_initialize(pb, target, ioff, isize, flags);
				717	}
				718	return pb;
				719	}
				720
				721	/*
				722	* If we are not low on memory then do the readahead in a deadlock
				723	* safe manner.
				724	*/
				725	void
				726	pagebuf_readahead(
				727	xfs_buftarg_t *target,
				728	loff_t ioff,
				729	size_t isize,
				730	page_buf_flags_t flags)
				731	{
				732	struct backing_dev_info *bdi;
				733
				734	bdi = target->pbr_mapping->backing_dev_info;
				735	if (bdi_read_congested(bdi))
				736	return;
				737
				738	flags \|= (PBF_TRYLOCK\|PBF_ASYNC\|PBF_READ_AHEAD);
				739	xfs_buf_read_flags(target, ioff, isize, flags);
				740	}
				741
				742	xfs_buf_t *
				743	pagebuf_get_empty(
				744	size_t len,
				745	xfs_buftarg_t *target)
				746	{
				747	xfs_buf_t *pb;
				748
				749	pb = pagebuf_allocate(0);
				750	if (pb)
				751	_pagebuf_initialize(pb, target, 0, len, 0);
				752	return pb;
				753	}
				754
				755	static inline struct page *
				756	mem_to_page(
				757	void *addr)
				758	{
				759	if (((unsigned long)addr < VMALLOC_START) \|\|
				760	((unsigned long)addr >= VMALLOC_END)) {
				761	return virt_to_page(addr);
				762	} else {
				763	return vmalloc_to_page(addr);
				764	}
				765	}
				766
				767	int
				768	pagebuf_associate_memory(
				769	xfs_buf_t *pb,
				770	void *mem,
				771	size_t len)
				772	{
				773	int rval;
				774	int i = 0;
				775	size_t ptr;
				776	size_t end, end_cur;
				777	off_t offset;
				778	int page_count;
				779
				780	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
				781	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
				782	if (offset && (len > PAGE_CACHE_SIZE))
				783	page_count++;
				784
				785	/* Free any previous set of page pointers */
				786	if (pb->pb_pages)
				787	_pagebuf_free_pages(pb);
				788
				789	pb->pb_pages = NULL;
				790	pb->pb_addr = mem;
				791
				792	rval = _pagebuf_get_pages(pb, page_count, 0);
				793	if (rval)
				794	return rval;
				795
				796	pb->pb_offset = offset;
				797	ptr = (size_t) mem & PAGE_CACHE_MASK;
				798	end = PAGE_CACHE_ALIGN((size_t) mem + len);
				799	end_cur = end;
				800	/* set up first page */
				801	pb->pb_pages[0] = mem_to_page(mem);
				802
				803	ptr += PAGE_CACHE_SIZE;
				804	pb->pb_page_count = ++i;
				805	while (ptr < end) {
				806	pb->pb_pages[i] = mem_to_page((void *)ptr);
				807	pb->pb_page_count = ++i;
				808	ptr += PAGE_CACHE_SIZE;
				809	}
				810	pb->pb_locked = 0;
				811
				812	pb->pb_count_desired = pb->pb_buffer_length = len;
				813	pb->pb_flags \|= PBF_MAPPED;
				814
				815	return 0;
				816	}
				817
				818	xfs_buf_t *
				819	pagebuf_get_no_daddr(
				820	size_t len,
				821	xfs_buftarg_t *target)
				822	{
				823	size_t malloc_len = len;
				824	xfs_buf_t *bp;
				825	void *data;
				826	int error;
				827
				828	bp = pagebuf_allocate(0);
				829	if (unlikely(bp == NULL))
				830	goto fail;
				831	_pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);
				832
				833	try_again:
				834	data = kmem_alloc(malloc_len, KM_SLEEP \| KM_MAYFAIL);
				835	if (unlikely(data == NULL))
				836	goto fail_free_buf;
				837
				838	/* check whether alignment matches.. */
				839	if ((__psunsigned_t)data !=
				840	((__psunsigned_t)data & ~target->pbr_smask)) {
				841	/* .. else double the size and try again */
				842	kmem_free(data, malloc_len);
				843	malloc_len <<= 1;
				844	goto try_again;
				845	}
				846
				847	error = pagebuf_associate_memory(bp, data, len);
				848	if (error)
				849	goto fail_free_mem;
				850	bp->pb_flags \|= _PBF_KMEM_ALLOC;
				851
				852	pagebuf_unlock(bp);
				853
				854	PB_TRACE(bp, "no_daddr", data);
				855	return bp;
				856	fail_free_mem:
				857	kmem_free(data, malloc_len);
				858	fail_free_buf:
				859	pagebuf_free(bp);
				860	fail:
				861	return NULL;
				862	}
				863
				864	/*
				865	* pagebuf_hold
				866	*
				867	* Increment reference count on buffer, to hold the buffer concurrently
				868	* with another thread which may release (free) the buffer asynchronously.
				869	*
				870	* Must hold the buffer already to call this function.
				871	*/
				872	void
				873	pagebuf_hold(
				874	xfs_buf_t *pb)
				875	{
				876	atomic_inc(&pb->pb_hold);
				877	PB_TRACE(pb, "hold", 0);
				878	}
				879
				880	/*
				881	* pagebuf_rele
				882	*
				883	* pagebuf_rele releases a hold on the specified buffer. If the
				884	* the hold count is 1, pagebuf_rele calls pagebuf_free.
				885	*/
				886	void
				887	pagebuf_rele(
				888	xfs_buf_t *pb)
				889	{
				890	xfs_bufhash_t *hash = pb->pb_hash;
				891
				892	PB_TRACE(pb, "rele", pb->pb_relse);
				893
				894	/*
				895	* pagebuf_lookup buffers are not hashed, not delayed write,
				896	* and don't have their own release routines. Special case.
				897	*/
				898	if (unlikely(!hash)) {
				899	ASSERT(!pb->pb_relse);
				900	if (atomic_dec_and_test(&pb->pb_hold))
				901	xfs_buf_free(pb);
				902	return;
				903	}
				904
				905	if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
				906	int do_free = 1;
				907
				908	if (pb->pb_relse) {
				909	atomic_inc(&pb->pb_hold);
				910	spin_unlock(&hash->bh_lock);
				911	(*(pb->pb_relse)) (pb);
				912	spin_lock(&hash->bh_lock);
				913	do_free = 0;
				914	}
				915
				916	if (pb->pb_flags & PBF_DELWRI) {
				917	pb->pb_flags \|= PBF_ASYNC;
				918	atomic_inc(&pb->pb_hold);
				919	pagebuf_delwri_queue(pb, 0);
				920	do_free = 0;
				921	} else if (pb->pb_flags & PBF_FS_MANAGED) {
				922	do_free = 0;
				923	}
				924
				925	if (do_free) {
				926	list_del_init(&pb->pb_hash_list);
				927	spin_unlock(&hash->bh_lock);
				928	pagebuf_free(pb);
				929	} else {
				930	spin_unlock(&hash->bh_lock);
				931	}
				932	}
				933	}
				934
				935
				936	/*
				937	* Mutual exclusion on buffers. Locking model:
				938	*
				939	* Buffers associated with inodes for which buffer locking
				940	* is not enabled are not protected by semaphores, and are
				941	* assumed to be exclusively owned by the caller. There is a
				942	* spinlock in the buffer, used by the caller when concurrent
				943	* access is possible.
				944	*/
				945
				946	/*
				947	* pagebuf_cond_lock
				948	*
				949	* pagebuf_cond_lock locks a buffer object, if it is not already locked.
				950	* Note that this in no way
				951	* locks the underlying pages, so it is only useful for synchronizing
				952	* concurrent use of page buffer objects, not for synchronizing independent
				953	* access to the underlying pages.
				954	*/
				955	int
				956	pagebuf_cond_lock( /* lock buffer, if not locked */
				957	/* returns -EBUSY if locked) */
				958	xfs_buf_t *pb)
				959	{
				960	int locked;
				961
				962	locked = down_trylock(&pb->pb_sema) == 0;
				963	if (locked) {
				964	PB_SET_OWNER(pb);
				965	}
				966	PB_TRACE(pb, "cond_lock", (long)locked);
				967	return(locked ? 0 : -EBUSY);
				968	}
				969
				970	#if defined(DEBUG) \|\| defined(XFS_BLI_TRACE)
				971	/*
				972	* pagebuf_lock_value
				973	*
				974	* Return lock value for a pagebuf
				975	*/
				976	int
				977	pagebuf_lock_value(
				978	xfs_buf_t *pb)
				979	{
				980	return(atomic_read(&pb->pb_sema.count));
				981	}
				982	#endif
				983
				984	/*
				985	* pagebuf_lock
				986	*
				987	* pagebuf_lock locks a buffer object. Note that this in no way
				988	* locks the underlying pages, so it is only useful for synchronizing
				989	* concurrent use of page buffer objects, not for synchronizing independent
				990	* access to the underlying pages.
				991	*/
				992	int
				993	pagebuf_lock(
				994	xfs_buf_t *pb)
				995	{
				996	PB_TRACE(pb, "lock", 0);
				997	if (atomic_read(&pb->pb_io_remaining))
				998	blk_run_address_space(pb->pb_target->pbr_mapping);
				999	down(&pb->pb_sema);
				1000	PB_SET_OWNER(pb);
				1001	PB_TRACE(pb, "locked", 0);
				1002	return 0;
				1003	}
				1004
				1005	/*
				1006	* pagebuf_unlock
				1007	*
				1008	* pagebuf_unlock releases the lock on the buffer object created by
				1009	* pagebuf_lock or pagebuf_cond_lock (not any
				1010	* pinning of underlying pages created by pagebuf_pin).
				1011	*/
				1012	void
				1013	pagebuf_unlock( /* unlock buffer */
				1014	xfs_buf_t pb) / buffer to unlock */
				1015	{
				1016	PB_CLEAR_OWNER(pb);
				1017	up(&pb->pb_sema);
				1018	PB_TRACE(pb, "unlock", 0);
				1019	}
				1020
				1021
				1022	/*
				1023	* Pinning Buffer Storage in Memory
				1024	*/
				1025
				1026	/*
				1027	* pagebuf_pin
				1028	*
				1029	* pagebuf_pin locks all of the memory represented by a buffer in
				1030	* memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
				1031	* the same or different buffers affecting a given page, will
				1032	* properly count the number of outstanding "pin" requests. The
				1033	* buffer may be released after the pagebuf_pin and a different
				1034	* buffer used when calling pagebuf_unpin, if desired.
				1035	* pagebuf_pin should be used by the file system when it wants be
				1036	* assured that no attempt will be made to force the affected
				1037	* memory to disk. It does not assure that a given logical page
				1038	* will not be moved to a different physical page.
				1039	*/
				1040	void
				1041	pagebuf_pin(
				1042	xfs_buf_t *pb)
				1043	{
				1044	atomic_inc(&pb->pb_pin_count);
				1045	PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
				1046	}
				1047
				1048	/*
				1049	* pagebuf_unpin
				1050	*
				1051	* pagebuf_unpin reverses the locking of memory performed by
				1052	* pagebuf_pin. Note that both functions affected the logical
				1053	* pages associated with the buffer, not the buffer itself.
				1054	*/
				1055	void
				1056	pagebuf_unpin(
				1057	xfs_buf_t *pb)
				1058	{
				1059	if (atomic_dec_and_test(&pb->pb_pin_count)) {
				1060	wake_up_all(&pb->pb_waiters);
				1061	}
				1062	PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
				1063	}
				1064
				1065	int
				1066	pagebuf_ispin(
				1067	xfs_buf_t *pb)
				1068	{
				1069	return atomic_read(&pb->pb_pin_count);
				1070	}
				1071
				1072	/*
				1073	* pagebuf_wait_unpin
				1074	*
				1075	* pagebuf_wait_unpin waits until all of the memory associated
				1076	* with the buffer is not longer locked in memory. It returns
				1077	* immediately if none of the affected pages are locked.
				1078	*/
				1079	static inline void
				1080	_pagebuf_wait_unpin(
				1081	xfs_buf_t *pb)
				1082	{
				1083	DECLARE_WAITQUEUE (wait, current);
				1084
				1085	if (atomic_read(&pb->pb_pin_count) == 0)
				1086	return;
				1087
				1088	add_wait_queue(&pb->pb_waiters, &wait);
				1089	for (;;) {
				1090	set_current_state(TASK_UNINTERRUPTIBLE);
				1091	if (atomic_read(&pb->pb_pin_count) == 0)
				1092	break;
				1093	if (atomic_read(&pb->pb_io_remaining))
				1094	blk_run_address_space(pb->pb_target->pbr_mapping);
				1095	schedule();
				1096	}
				1097	remove_wait_queue(&pb->pb_waiters, &wait);
				1098	set_current_state(TASK_RUNNING);
				1099	}
				1100
				1101	/*
				1102	* Buffer Utility Routines
				1103	*/
				1104
				1105	/*
				1106	* pagebuf_iodone
				1107	*
				1108	* pagebuf_iodone marks a buffer for which I/O is in progress
				1109	* done with respect to that I/O. The pb_iodone routine, if
				1110	* present, will be called as a side-effect.
				1111	*/
				1112	STATIC void
				1113	pagebuf_iodone_work(
				1114	void *v)
				1115	{
				1116	xfs_buf_t bp = (xfs_buf_t )v;
				1117
				1118	if (bp->pb_iodone)
				1119	(*(bp->pb_iodone))(bp);
				1120	else if (bp->pb_flags & PBF_ASYNC)
				1121	xfs_buf_relse(bp);
				1122	}
				1123
				1124	void
				1125	pagebuf_iodone(
				1126	xfs_buf_t *pb,
				1127	int dataio,
				1128	int schedule)
				1129	{
				1130	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE);
				1131	if (pb->pb_error == 0) {
				1132	pb->pb_flags &= ~(PBF_PARTIAL \| PBF_NONE);
				1133	}
				1134
				1135	PB_TRACE(pb, "iodone", pb->pb_iodone);
				1136
				1137	if ((pb->pb_iodone) \|\| (pb->pb_flags & PBF_ASYNC)) {
				1138	if (schedule) {
				1139	INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1140	queue_work(dataio ? xfsdatad_workqueue :
				1141	xfslogd_workqueue, &pb->pb_iodone_work);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1142	} else {
				1143	pagebuf_iodone_work(pb);
				1144	}
				1145	} else {
				1146	up(&pb->pb_iodonesema);
				1147	}
				1148	}
				1149
				1150	/*
				1151	* pagebuf_ioerror
				1152	*
				1153	* pagebuf_ioerror sets the error code for a buffer.
				1154	*/
				1155	void
				1156	pagebuf_ioerror( /* mark/clear buffer error flag */
				1157	xfs_buf_t pb, / buffer to mark */
				1158	int error) /* error to store (0 if none) */
				1159	{
				1160	ASSERT(error >= 0 && error <= 0xffff);
				1161	pb->pb_error = (unsigned short)error;
				1162	PB_TRACE(pb, "ioerror", (unsigned long)error);
				1163	}
				1164
				1165	/*
				1166	* pagebuf_iostart
				1167	*
				1168	* pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
				1169	* If necessary, it will arrange for any disk space allocation required,
				1170	* and it will break up the request if the block mappings require it.
				1171	* The pb_iodone routine in the buffer supplied will only be called
				1172	* when all of the subsidiary I/O requests, if any, have been completed.
				1173	* pagebuf_iostart calls the pagebuf_ioinitiate routine or
				1174	* pagebuf_iorequest, if the former routine is not defined, to start
				1175	* the I/O on a given low-level request.
				1176	*/
				1177	int
				1178	pagebuf_iostart( /* start I/O on a buffer */
				1179	xfs_buf_t pb, / buffer to start */
				1180	page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
				1181	/* PBF_WRITE, PBF_DELWRI, */
				1182	/* PBF_DONT_BLOCK */
				1183	{
				1184	int status = 0;
				1185
				1186	PB_TRACE(pb, "iostart", (unsigned long)flags);
				1187
				1188	if (flags & PBF_DELWRI) {
				1189	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE \| PBF_ASYNC);
				1190	pb->pb_flags \|= flags & (PBF_DELWRI \| PBF_ASYNC);
				1191	pagebuf_delwri_queue(pb, 1);
				1192	return status;
				1193	}
				1194
				1195	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE \| PBF_ASYNC \| PBF_DELWRI \| \
				1196	PBF_READ_AHEAD \| _PBF_RUN_QUEUES);
				1197	pb->pb_flags \|= flags & (PBF_READ \| PBF_WRITE \| PBF_ASYNC \| \
				1198	PBF_READ_AHEAD \| _PBF_RUN_QUEUES);
				1199
				1200	BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
				1201
				1202	/* For writes allow an alternate strategy routine to precede
				1203	* the actual I/O request (which may not be issued at all in
				1204	* a shutdown situation, for example).
				1205	*/
				1206	status = (flags & PBF_WRITE) ?
				1207	pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
				1208
				1209	/* Wait for I/O if we are not an async request.
				1210	* Note: async I/O request completion will release the buffer,
				1211	* and that can already be done by this point. So using the
				1212	* buffer pointer from here on, after async I/O, is invalid.
				1213	*/
				1214	if (!status && !(flags & PBF_ASYNC))
				1215	status = pagebuf_iowait(pb);
				1216
				1217	return status;
				1218	}
				1219
				1220	/*
				1221	* Helper routine for pagebuf_iorequest
				1222	*/
				1223
				1224	STATIC __inline__ int
				1225	_pagebuf_iolocked(
				1226	xfs_buf_t *pb)
				1227	{
				1228	ASSERT(pb->pb_flags & (PBF_READ\|PBF_WRITE));
				1229	if (pb->pb_flags & PBF_READ)
				1230	return pb->pb_locked;
				1231	return 0;
				1232	}
				1233
				1234	STATIC __inline__ void
				1235	_pagebuf_iodone(
				1236	xfs_buf_t *pb,
				1237	int schedule)
				1238	{
				1239	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
				1240	pb->pb_locked = 0;
				1241	pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
				1242	}
				1243	}
				1244
				1245	STATIC int
				1246	bio_end_io_pagebuf(
				1247	struct bio *bio,
				1248	unsigned int bytes_done,
				1249	int error)
				1250	{
				1251	xfs_buf_t pb = (xfs_buf_t )bio->bi_private;
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame^]	1252	unsigned int blocksize = pb->pb_target->pbr_bsize;
				1253	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1254
				1255	if (bio->bi_size)
				1256	return 1;
				1257
				1258	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				1259	pb->pb_error = EIO;
				1260
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame^]	1261	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1262	struct page *page = bvec->bv_page;
				1263
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame^]	1264	if (unlikely(pb->pb_error)) {
				1265	if (pb->pb_flags & PBF_READ)
				1266	ClearPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1267	SetPageError(page);
				1268	} else if (blocksize == PAGE_CACHE_SIZE) {
				1269	SetPageUptodate(page);
				1270	} else if (!PagePrivate(page) &&
				1271	(pb->pb_flags & _PBF_PAGE_CACHE)) {
				1272	set_page_region(page, bvec->bv_offset, bvec->bv_len);
				1273	}
				1274
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame^]	1275	if (--bvec >= bio->bi_io_vec)
				1276	prefetchw(&bvec->bv_page->flags);
				1277
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1278	if (_pagebuf_iolocked(pb)) {
				1279	unlock_page(page);
				1280	}
Nathan Scott	eedb553	2005-09-02 16:39:56 +1000	[diff] [blame^]	1281	} while (bvec >= bio->bi_io_vec);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1282
				1283	_pagebuf_iodone(pb, 1);
				1284	bio_put(bio);
				1285	return 0;
				1286	}
				1287
				1288	STATIC void
				1289	_pagebuf_ioapply(
				1290	xfs_buf_t *pb)
				1291	{
				1292	int i, rw, map_i, total_nr_pages, nr_pages;
				1293	struct bio *bio;
				1294	int offset = pb->pb_offset;
				1295	int size = pb->pb_count_desired;
				1296	sector_t sector = pb->pb_bn;
				1297	unsigned int blocksize = pb->pb_target->pbr_bsize;
				1298	int locking = _pagebuf_iolocked(pb);
				1299
				1300	total_nr_pages = pb->pb_page_count;
				1301	map_i = 0;
				1302
				1303	if (pb->pb_flags & _PBF_RUN_QUEUES) {
				1304	pb->pb_flags &= ~_PBF_RUN_QUEUES;
				1305	rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC;
				1306	} else {
				1307	rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
				1308	}
				1309
				1310	/* Special code path for reading a sub page size pagebuf in --
				1311	* we populate up the whole page, and hence the other metadata
				1312	* in the same page. This optimization is only valid when the
				1313	* filesystem block size and the page size are equal.
				1314	*/
				1315	if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
				1316	(pb->pb_flags & PBF_READ) && locking &&
				1317	(blocksize == PAGE_CACHE_SIZE)) {
				1318	bio = bio_alloc(GFP_NOIO, 1);
				1319
				1320	bio->bi_bdev = pb->pb_target->pbr_bdev;
				1321	bio->bi_sector = sector - (offset >> BBSHIFT);
				1322	bio->bi_end_io = bio_end_io_pagebuf;
				1323	bio->bi_private = pb;
				1324
				1325	bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
				1326	size = 0;
				1327
				1328	atomic_inc(&pb->pb_io_remaining);
				1329
				1330	goto submit_io;
				1331	}
				1332
				1333	/* Lock down the pages which we need to for the request */
				1334	if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
				1335	for (i = 0; size; i++) {
				1336	int nbytes = PAGE_CACHE_SIZE - offset;
				1337	struct page *page = pb->pb_pages[i];
				1338
				1339	if (nbytes > size)
				1340	nbytes = size;
				1341
				1342	lock_page(page);
				1343
				1344	size -= nbytes;
				1345	offset = 0;
				1346	}
				1347	offset = pb->pb_offset;
				1348	size = pb->pb_count_desired;
				1349	}
				1350
				1351	next_chunk:
				1352	atomic_inc(&pb->pb_io_remaining);
				1353	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
				1354	if (nr_pages > total_nr_pages)
				1355	nr_pages = total_nr_pages;
				1356
				1357	bio = bio_alloc(GFP_NOIO, nr_pages);
				1358	bio->bi_bdev = pb->pb_target->pbr_bdev;
				1359	bio->bi_sector = sector;
				1360	bio->bi_end_io = bio_end_io_pagebuf;
				1361	bio->bi_private = pb;
				1362
				1363	for (; size && nr_pages; nr_pages--, map_i++) {
				1364	int nbytes = PAGE_CACHE_SIZE - offset;
				1365
				1366	if (nbytes > size)
				1367	nbytes = size;
				1368
				1369	if (bio_add_page(bio, pb->pb_pages[map_i],
				1370	nbytes, offset) < nbytes)
				1371	break;
				1372
				1373	offset = 0;
				1374	sector += nbytes >> BBSHIFT;
				1375	size -= nbytes;
				1376	total_nr_pages--;
				1377	}
				1378
				1379	submit_io:
				1380	if (likely(bio->bi_size)) {
				1381	submit_bio(rw, bio);
				1382	if (size)
				1383	goto next_chunk;
				1384	} else {
				1385	bio_put(bio);
				1386	pagebuf_ioerror(pb, EIO);
				1387	}
				1388	}
				1389
				1390	/*
				1391	* pagebuf_iorequest -- the core I/O request routine.
				1392	*/
				1393	int
				1394	pagebuf_iorequest( /* start real I/O */
				1395	xfs_buf_t pb) / buffer to convey to device */
				1396	{
				1397	PB_TRACE(pb, "iorequest", 0);
				1398
				1399	if (pb->pb_flags & PBF_DELWRI) {
				1400	pagebuf_delwri_queue(pb, 1);
				1401	return 0;
				1402	}
				1403
				1404	if (pb->pb_flags & PBF_WRITE) {
				1405	_pagebuf_wait_unpin(pb);
				1406	}
				1407
				1408	pagebuf_hold(pb);
				1409
				1410	/* Set the count to 1 initially, this will stop an I/O
				1411	* completion callout which happens before we have started
				1412	* all the I/O from calling pagebuf_iodone too early.
				1413	*/
				1414	atomic_set(&pb->pb_io_remaining, 1);
				1415	_pagebuf_ioapply(pb);
				1416	_pagebuf_iodone(pb, 0);
				1417
				1418	pagebuf_rele(pb);
				1419	return 0;
				1420	}
				1421
				1422	/*
				1423	* pagebuf_iowait
				1424	*
				1425	* pagebuf_iowait waits for I/O to complete on the buffer supplied.
				1426	* It returns immediately if no I/O is pending. In any case, it returns
				1427	* the error code, if any, or 0 if there is no error.
				1428	*/
				1429	int
				1430	pagebuf_iowait(
				1431	xfs_buf_t *pb)
				1432	{
				1433	PB_TRACE(pb, "iowait", 0);
				1434	if (atomic_read(&pb->pb_io_remaining))
				1435	blk_run_address_space(pb->pb_target->pbr_mapping);
				1436	down(&pb->pb_iodonesema);
				1437	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
				1438	return pb->pb_error;
				1439	}
				1440
				1441	caddr_t
				1442	pagebuf_offset(
				1443	xfs_buf_t *pb,
				1444	size_t offset)
				1445	{
				1446	struct page *page;
				1447
				1448	offset += pb->pb_offset;
				1449
				1450	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
				1451	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
				1452	}
				1453
				1454	/*
				1455	* pagebuf_iomove
				1456	*
				1457	* Move data into or out of a buffer.
				1458	*/
				1459	void
				1460	pagebuf_iomove(
				1461	xfs_buf_t pb, / buffer to process */
				1462	size_t boff, /* starting buffer offset */
				1463	size_t bsize, /* length to copy */
				1464	caddr_t data, /* data address */
				1465	page_buf_rw_t mode) /* read/write flag */
				1466	{
				1467	size_t bend, cpoff, csize;
				1468	struct page *page;
				1469
				1470	bend = boff + bsize;
				1471	while (boff < bend) {
				1472	page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
				1473	cpoff = page_buf_poff(boff + pb->pb_offset);
				1474	csize = min_t(size_t,
				1475	PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
				1476
				1477	ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
				1478
				1479	switch (mode) {
				1480	case PBRW_ZERO:
				1481	memset(page_address(page) + cpoff, 0, csize);
				1482	break;
				1483	case PBRW_READ:
				1484	memcpy(data, page_address(page) + cpoff, csize);
				1485	break;
				1486	case PBRW_WRITE:
				1487	memcpy(page_address(page) + cpoff, data, csize);
				1488	}
				1489
				1490	boff += csize;
				1491	data += csize;
				1492	}
				1493	}
				1494
				1495	/*
				1496	* Handling of buftargs.
				1497	*/
				1498
				1499	/*
				1500	* Wait for any bufs with callbacks that have been submitted but
				1501	* have not yet returned... walk the hash list for the target.
				1502	*/
				1503	void
				1504	xfs_wait_buftarg(
				1505	xfs_buftarg_t *btp)
				1506	{
				1507	xfs_buf_t bp, n;
				1508	xfs_bufhash_t *hash;
				1509	uint i;
				1510
				1511	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
				1512	hash = &btp->bt_hash[i];
				1513	again:
				1514	spin_lock(&hash->bh_lock);
				1515	list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) {
				1516	ASSERT(btp == bp->pb_target);
				1517	if (!(bp->pb_flags & PBF_FS_MANAGED)) {
				1518	spin_unlock(&hash->bh_lock);
				1519	delay(100);
				1520	goto again;
				1521	}
				1522	}
				1523	spin_unlock(&hash->bh_lock);
				1524	}
				1525	}
				1526
				1527	/*
				1528	* Allocate buffer hash table for a given target.
				1529	* For devices containing metadata (i.e. not the log/realtime devices)
				1530	* we need to allocate a much larger hash table.
				1531	*/
				1532	STATIC void
				1533	xfs_alloc_bufhash(
				1534	xfs_buftarg_t *btp,
				1535	int external)
				1536	{
				1537	unsigned int i;
				1538
				1539	btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
				1540	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
				1541	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
				1542	sizeof(xfs_bufhash_t), KM_SLEEP);
				1543	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
				1544	spin_lock_init(&btp->bt_hash[i].bh_lock);
				1545	INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
				1546	}
				1547	}
				1548
				1549	STATIC void
				1550	xfs_free_bufhash(
				1551	xfs_buftarg_t *btp)
				1552	{
				1553	kmem_free(btp->bt_hash,
				1554	(1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
				1555	btp->bt_hash = NULL;
				1556	}
				1557
				1558	void
				1559	xfs_free_buftarg(
				1560	xfs_buftarg_t *btp,
				1561	int external)
				1562	{
				1563	xfs_flush_buftarg(btp, 1);
				1564	if (external)
				1565	xfs_blkdev_put(btp->pbr_bdev);
				1566	xfs_free_bufhash(btp);
				1567	iput(btp->pbr_mapping->host);
				1568	kmem_free(btp, sizeof(*btp));
				1569	}
				1570
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1571	STATIC int
				1572	xfs_setsize_buftarg_flags(
				1573	xfs_buftarg_t *btp,
				1574	unsigned int blocksize,
				1575	unsigned int sectorsize,
				1576	int verbose)
				1577	{
				1578	btp->pbr_bsize = blocksize;
				1579	btp->pbr_sshift = ffs(sectorsize) - 1;
				1580	btp->pbr_smask = sectorsize - 1;
				1581
				1582	if (set_blocksize(btp->pbr_bdev, sectorsize)) {
				1583	printk(KERN_WARNING
				1584	"XFS: Cannot set_blocksize to %u on device %s\n",
				1585	sectorsize, XFS_BUFTARG_NAME(btp));
				1586	return EINVAL;
				1587	}
				1588
				1589	if (verbose &&
				1590	(PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
				1591	printk(KERN_WARNING
				1592	"XFS: %u byte sectors in use on device %s. "
				1593	"This is suboptimal; %u or greater is ideal.\n",
				1594	sectorsize, XFS_BUFTARG_NAME(btp),
				1595	(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
				1596	}
				1597
				1598	return 0;
				1599	}
				1600
				1601	/*
				1602	* When allocating the initial buffer target we have not yet
				1603	* read in the superblock, so don't know what sized sectors
				1604	* are being used is at this early stage. Play safe.
				1605	*/
				1606	STATIC int
				1607	xfs_setsize_buftarg_early(
				1608	xfs_buftarg_t *btp,
				1609	struct block_device *bdev)
				1610	{
				1611	return xfs_setsize_buftarg_flags(btp,
				1612	PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
				1613	}
				1614
				1615	int
				1616	xfs_setsize_buftarg(
				1617	xfs_buftarg_t *btp,
				1618	unsigned int blocksize,
				1619	unsigned int sectorsize)
				1620	{
				1621	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
				1622	}
				1623
				1624	STATIC int
				1625	xfs_mapping_buftarg(
				1626	xfs_buftarg_t *btp,
				1627	struct block_device *bdev)
				1628	{
				1629	struct backing_dev_info *bdi;
				1630	struct inode *inode;
				1631	struct address_space *mapping;
				1632	static struct address_space_operations mapping_aops = {
				1633	.sync_page = block_sync_page,
				1634	};
				1635
				1636	inode = new_inode(bdev->bd_inode->i_sb);
				1637	if (!inode) {
				1638	printk(KERN_WARNING
				1639	"XFS: Cannot allocate mapping inode for device %s\n",
				1640	XFS_BUFTARG_NAME(btp));
				1641	return ENOMEM;
				1642	}
				1643	inode->i_mode = S_IFBLK;
				1644	inode->i_bdev = bdev;
				1645	inode->i_rdev = bdev->bd_dev;
				1646	bdi = blk_get_backing_dev_info(bdev);
				1647	if (!bdi)
				1648	bdi = &default_backing_dev_info;
				1649	mapping = &inode->i_data;
				1650	mapping->a_ops = &mapping_aops;
				1651	mapping->backing_dev_info = bdi;
				1652	mapping_set_gfp_mask(mapping, GFP_NOFS);
				1653	btp->pbr_mapping = mapping;
				1654	return 0;
				1655	}
				1656
				1657	xfs_buftarg_t *
				1658	xfs_alloc_buftarg(
				1659	struct block_device *bdev,
				1660	int external)
				1661	{
				1662	xfs_buftarg_t *btp;
				1663
				1664	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
				1665
				1666	btp->pbr_dev = bdev->bd_dev;
				1667	btp->pbr_bdev = bdev;
				1668	if (xfs_setsize_buftarg_early(btp, bdev))
				1669	goto error;
				1670	if (xfs_mapping_buftarg(btp, bdev))
				1671	goto error;
				1672	xfs_alloc_bufhash(btp, external);
				1673	return btp;
				1674
				1675	error:
				1676	kmem_free(btp, sizeof(*btp));
				1677	return NULL;
				1678	}
				1679
				1680
				1681	/*
				1682	* Pagebuf delayed write buffer handling
				1683	*/
				1684
				1685	STATIC LIST_HEAD(pbd_delwrite_queue);
				1686	STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);
				1687
				1688	STATIC void
				1689	pagebuf_delwri_queue(
				1690	xfs_buf_t *pb,
				1691	int unlock)
				1692	{
				1693	PB_TRACE(pb, "delwri_q", (long)unlock);
				1694	ASSERT(pb->pb_flags & PBF_DELWRI);
				1695
				1696	spin_lock(&pbd_delwrite_lock);
				1697	/* If already in the queue, dequeue and place at tail */
				1698	if (!list_empty(&pb->pb_list)) {
				1699	if (unlock) {
				1700	atomic_dec(&pb->pb_hold);
				1701	}
				1702	list_del(&pb->pb_list);
				1703	}
				1704
				1705	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
				1706	pb->pb_queuetime = jiffies;
				1707	spin_unlock(&pbd_delwrite_lock);
				1708
				1709	if (unlock)
				1710	pagebuf_unlock(pb);
				1711	}
				1712
				1713	void
				1714	pagebuf_delwri_dequeue(
				1715	xfs_buf_t *pb)
				1716	{
				1717	int dequeued = 0;
				1718
				1719	spin_lock(&pbd_delwrite_lock);
				1720	if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
				1721	list_del_init(&pb->pb_list);
				1722	dequeued = 1;
				1723	}
				1724	pb->pb_flags &= ~PBF_DELWRI;
				1725	spin_unlock(&pbd_delwrite_lock);
				1726
				1727	if (dequeued)
				1728	pagebuf_rele(pb);
				1729
				1730	PB_TRACE(pb, "delwri_dq", (long)dequeued);
				1731	}
				1732
				1733	STATIC void
				1734	pagebuf_runall_queues(
				1735	struct workqueue_struct *queue)
				1736	{
				1737	flush_workqueue(queue);
				1738	}
				1739
				1740	/* Defines for pagebuf daemon */
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1741	STATIC DECLARE_COMPLETION(xfsbufd_done);
				1742	STATIC struct task_struct *xfsbufd_task;
				1743	STATIC int xfsbufd_active;
				1744	STATIC int xfsbufd_force_flush;
				1745	STATIC int xfsbufd_force_sleep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1746
				1747	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1748	xfsbufd_wakeup(
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1749	int priority,
				1750	unsigned int mask)
				1751	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1752	if (xfsbufd_force_sleep)
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1753	return 0;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1754	xfsbufd_force_flush = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1755	barrier();
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1756	wake_up_process(xfsbufd_task);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1757	return 0;
				1758	}
				1759
				1760	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1761	xfsbufd(
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1762	void *data)
				1763	{
				1764	struct list_head tmp;
				1765	unsigned long age;
				1766	xfs_buftarg_t *target;
				1767	xfs_buf_t pb, n;
				1768
				1769	/* Set up the thread */
				1770	daemonize("xfsbufd");
				1771	current->flags \|= PF_MEMALLOC;
				1772
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1773	xfsbufd_task = current;
				1774	xfsbufd_active = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1775	barrier();
				1776
				1777	INIT_LIST_HEAD(&tmp);
				1778	do {
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame]	1779	if (unlikely(freezing(current))) {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1780	xfsbufd_force_sleep = 1;
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame]	1781	refrigerator();
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1782	} else {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1783	xfsbufd_force_sleep = 0;
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1784	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1785
				1786	set_current_state(TASK_INTERRUPTIBLE);
				1787	schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);
				1788
				1789	age = (xfs_buf_age_centisecs * HZ) / 100;
				1790	spin_lock(&pbd_delwrite_lock);
				1791	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
				1792	PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
				1793	ASSERT(pb->pb_flags & PBF_DELWRI);
				1794
				1795	if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1796	if (!xfsbufd_force_flush &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1797	time_before(jiffies,
				1798	pb->pb_queuetime + age)) {
				1799	pagebuf_unlock(pb);
				1800	break;
				1801	}
				1802
				1803	pb->pb_flags &= ~PBF_DELWRI;
				1804	pb->pb_flags \|= PBF_WRITE;
				1805	list_move(&pb->pb_list, &tmp);
				1806	}
				1807	}
				1808	spin_unlock(&pbd_delwrite_lock);
				1809
				1810	while (!list_empty(&tmp)) {
				1811	pb = list_entry(tmp.next, xfs_buf_t, pb_list);
				1812	target = pb->pb_target;
				1813
				1814	list_del_init(&pb->pb_list);
				1815	pagebuf_iostrategy(pb);
				1816
				1817	blk_run_address_space(target->pbr_mapping);
				1818	}
				1819
				1820	if (as_list_len > 0)
				1821	purge_addresses();
				1822
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1823	xfsbufd_force_flush = 0;
				1824	} while (xfsbufd_active);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1825
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1826	complete_and_exit(&xfsbufd_done, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1827	}
				1828
				1829	/*
				1830	* Go through all incore buffers, and release buffers if they belong to
				1831	* the given device. This is used in filesystem error handling to
				1832	* preserve the consistency of its metadata.
				1833	*/
				1834	int
				1835	xfs_flush_buftarg(
				1836	xfs_buftarg_t *target,
				1837	int wait)
				1838	{
				1839	struct list_head tmp;
				1840	xfs_buf_t pb, n;
				1841	int pincount = 0;
				1842
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1843	pagebuf_runall_queues(xfsdatad_workqueue);
				1844	pagebuf_runall_queues(xfslogd_workqueue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1845
				1846	INIT_LIST_HEAD(&tmp);
				1847	spin_lock(&pbd_delwrite_lock);
				1848	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
				1849
				1850	if (pb->pb_target != target)
				1851	continue;
				1852
				1853	ASSERT(pb->pb_flags & PBF_DELWRI);
				1854	PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
				1855	if (pagebuf_ispin(pb)) {
				1856	pincount++;
				1857	continue;
				1858	}
				1859
				1860	pb->pb_flags &= ~PBF_DELWRI;
				1861	pb->pb_flags \|= PBF_WRITE;
				1862	list_move(&pb->pb_list, &tmp);
				1863	}
				1864	spin_unlock(&pbd_delwrite_lock);
				1865
				1866	/*
				1867	* Dropped the delayed write list lock, now walk the temporary list
				1868	*/
				1869	list_for_each_entry_safe(pb, n, &tmp, pb_list) {
				1870	if (wait)
				1871	pb->pb_flags &= ~PBF_ASYNC;
				1872	else
				1873	list_del_init(&pb->pb_list);
				1874
				1875	pagebuf_lock(pb);
				1876	pagebuf_iostrategy(pb);
				1877	}
				1878
				1879	/*
				1880	* Remaining list items must be flushed before returning
				1881	*/
				1882	while (!list_empty(&tmp)) {
				1883	pb = list_entry(tmp.next, xfs_buf_t, pb_list);
				1884
				1885	list_del_init(&pb->pb_list);
				1886	xfs_iowait(pb);
				1887	xfs_buf_relse(pb);
				1888	}
				1889
				1890	if (wait)
				1891	blk_run_address_space(target->pbr_mapping);
				1892
				1893	return pincount;
				1894	}
				1895
				1896	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1897	xfs_buf_daemons_start(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1898	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1899	int error = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1900
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1901	xfslogd_workqueue = create_workqueue("xfslogd");
				1902	if (!xfslogd_workqueue)
				1903	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1904
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1905	xfsdatad_workqueue = create_workqueue("xfsdatad");
				1906	if (!xfsdatad_workqueue)
				1907	goto out_destroy_xfslogd_workqueue;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1908
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1909	error = kernel_thread(xfsbufd, NULL, CLONE_FS\|CLONE_FILES);
				1910	if (error < 0)
				1911	goto out_destroy_xfsdatad_workqueue;
				1912	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1913
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1914	out_destroy_xfsdatad_workqueue:
				1915	destroy_workqueue(xfsdatad_workqueue);
				1916	out_destroy_xfslogd_workqueue:
				1917	destroy_workqueue(xfslogd_workqueue);
				1918	out:
				1919	return error;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1920	}
				1921
				1922	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1923	* Note: do not mark as __exit, it is called from pagebuf_terminate.
				1924	*/
				1925	STATIC void
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1926	xfs_buf_daemons_stop(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1927	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1928	xfsbufd_active = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1929	barrier();
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1930	wait_for_completion(&xfsbufd_done);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1931
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1932	destroy_workqueue(xfslogd_workqueue);
				1933	destroy_workqueue(xfsdatad_workqueue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1934	}
				1935
				1936	/*
				1937	* Initialization and Termination
				1938	*/
				1939
				1940	int __init
				1941	pagebuf_init(void)
				1942	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1943	int error = -ENOMEM;
				1944
				1945	pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
				1946	if (!pagebuf_zone)
				1947	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1948
				1949	#ifdef PAGEBUF_TRACE
				1950	pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
				1951	#endif
				1952
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1953	error = xfs_buf_daemons_start();
Christoph Hellwig	cf9937c	2005-06-21 15:35:24 +1000	[diff] [blame]	1954	if (error)
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1955	goto out_free_buf_zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1956
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1957	pagebuf_shake = kmem_shake_register(xfsbufd_wakeup);
				1958	if (!pagebuf_shake) {
				1959	error = -ENOMEM;
				1960	goto out_stop_daemons;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1961	}
				1962
				1963	return 0;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1964
				1965	out_stop_daemons:
				1966	xfs_buf_daemons_stop();
				1967	out_free_buf_zone:
				1968	#ifdef PAGEBUF_TRACE
				1969	ktrace_free(pagebuf_trace_buf);
				1970	#endif
				1971	kmem_zone_destroy(pagebuf_zone);
				1972	out:
				1973	return error;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1974	}
				1975
				1976
				1977	/*
				1978	* pagebuf_terminate.
				1979	*
				1980	* Note: do not mark as __exit, this is also called from the __init code.
				1981	*/
				1982	void
				1983	pagebuf_terminate(void)
				1984	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1985	xfs_buf_daemons_stop();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1986
				1987	#ifdef PAGEBUF_TRACE
				1988	ktrace_free(pagebuf_trace_buf);
				1989	#endif
				1990
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1991	kmem_zone_destroy(pagebuf_zone);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1992	kmem_shake_deregister(pagebuf_shake);
				1993	}