Blame - fs/xfs/linux-2.6/xfs_buf.c - kernel/msm-4.9

blob: df0cba239dd589b2ac7b9fbb90aa0909190a5ac3 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or modify it
				5	* under the terms of version 2 of the GNU General Public License as
				6	* published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it would be useful, but
				9	* WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
				11	*
				12	* Further, this software is distributed without any warranty that it is
				13	* free of the rightful claim of any third person regarding infringement
				14	* or the like. Any license provided herein, whether implied or
				15	* otherwise, applies only to this software file. Patent licenses, if
				16	* any, provided herein do not apply to combinations of this program with
				17	* other software, or any other product whatsoever.
				18	*
				19	* You should have received a copy of the GNU General Public License along
				20	* with this program; if not, write the Free Software Foundation, Inc., 59
				21	* Temple Place - Suite 330, Boston MA 02111-1307, USA.
				22	*
				23	* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
				24	* Mountain View, CA 94043, or:
				25	*
				26	* http://www.sgi.com
				27	*
				28	* For further information regarding this notice, see:
				29	*
				30	* http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
				31	*/
				32
				33	/*
				34	* The xfs_buf.c code provides an abstract buffer cache model on top
				35	* of the Linux page cache. Cached metadata blocks for a file system
				36	* are hashed to the inode for the block device. xfs_buf.c assembles
				37	* buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
				38	*
				39	* Written by Steve Lord, Jim Mostek, Russell Cattelan
				40	* and Rajagopal Ananthanarayanan ("ananth") at SGI.
				41	*
				42	*/
				43
				44	#include <linux/stddef.h>
				45	#include <linux/errno.h>
				46	#include <linux/slab.h>
				47	#include <linux/pagemap.h>
				48	#include <linux/init.h>
				49	#include <linux/vmalloc.h>
				50	#include <linux/bio.h>
				51	#include <linux/sysctl.h>
				52	#include <linux/proc_fs.h>
				53	#include <linux/workqueue.h>
				54	#include <linux/percpu.h>
				55	#include <linux/blkdev.h>
				56	#include <linux/hash.h>
				57
				58	#include "xfs_linux.h"
				59
				60	/*
				61	* File wide globals
				62	*/
				63
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	64	STATIC kmem_cache_t *pagebuf_zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	65	STATIC kmem_shaker_t pagebuf_shake;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	66	STATIC int xfsbufd_wakeup(int, unsigned int);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	67	STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	68
				69	STATIC struct workqueue_struct *xfslogd_workqueue;
				70	STATIC struct workqueue_struct *xfsdatad_workqueue;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	71
				72	/*
				73	* Pagebuf debugging
				74	*/
				75
				76	#ifdef PAGEBUF_TRACE
				77	void
				78	pagebuf_trace(
				79	xfs_buf_t *pb,
				80	char *id,
				81	void *data,
				82	void *ra)
				83	{
				84	ktrace_enter(pagebuf_trace_buf,
				85	pb, id,
				86	(void *)(unsigned long)pb->pb_flags,
				87	(void *)(unsigned long)pb->pb_hold.counter,
				88	(void *)(unsigned long)pb->pb_sema.count.counter,
				89	(void *)current,
				90	data, ra,
				91	(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
				92	(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
				93	(void *)(unsigned long)pb->pb_buffer_length,
				94	NULL, NULL, NULL, NULL, NULL);
				95	}
				96	ktrace_t *pagebuf_trace_buf;
				97	#define PAGEBUF_TRACE_SIZE 4096
				98	#define PB_TRACE(pb, id, data) \
				99	pagebuf_trace(pb, id, (void )data, (void )__builtin_return_address(0))
				100	#else
				101	#define PB_TRACE(pb, id, data) do { } while (0)
				102	#endif
				103
				104	#ifdef PAGEBUF_LOCK_TRACKING
				105	# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
				106	# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
				107	# define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
				108	#else
				109	# define PB_SET_OWNER(pb) do { } while (0)
				110	# define PB_CLEAR_OWNER(pb) do { } while (0)
				111	# define PB_GET_OWNER(pb) do { } while (0)
				112	#endif
				113
				114	/*
				115	* Pagebuf allocation / freeing.
				116	*/
				117
				118	#define pb_to_gfp(flags) \
				119	((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
				120	((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) \| __GFP_NOWARN)
				121
				122	#define pb_to_km(flags) \
				123	(((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
				124
				125
				126	#define pagebuf_allocate(flags) \
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	127	kmem_zone_alloc(pagebuf_zone, pb_to_km(flags))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	128	#define pagebuf_deallocate(pb) \
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	129	kmem_zone_free(pagebuf_zone, (pb));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	130
				131	/*
				132	* Page Region interfaces.
				133	*
				134	* For pages in filesystems where the blocksize is smaller than the
				135	* pagesize, we use the page->private field (long) to hold a bitmap
				136	* of uptodate regions within the page.
				137	*
				138	* Each such region is "bytes per page / bits per long" bytes long.
				139	*
				140	* NBPPR == number-of-bytes-per-page-region
				141	* BTOPR == bytes-to-page-region (rounded up)
				142	* BTOPRT == bytes-to-page-region-truncated (rounded down)
				143	*/
				144	#if (BITS_PER_LONG == 32)
				145	#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
				146	#elif (BITS_PER_LONG == 64)
				147	#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
				148	#else
				149	#error BITS_PER_LONG must be 32 or 64
				150	#endif
				151	#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
				152	#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
				153	#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
				154
				155	STATIC unsigned long
				156	page_region_mask(
				157	size_t offset,
				158	size_t length)
				159	{
				160	unsigned long mask;
				161	int first, final;
				162
				163	first = BTOPR(offset);
				164	final = BTOPRT(offset + length - 1);
				165	first = min(first, final);
				166
				167	mask = ~0UL;
				168	mask <<= BITS_PER_LONG - (final - first);
				169	mask >>= BITS_PER_LONG - (final);
				170
				171	ASSERT(offset + length <= PAGE_CACHE_SIZE);
				172	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
				173
				174	return mask;
				175	}
				176
				177	STATIC inline void
				178	set_page_region(
				179	struct page *page,
				180	size_t offset,
				181	size_t length)
				182	{
				183	page->private \|= page_region_mask(offset, length);
				184	if (page->private == ~0UL)
				185	SetPageUptodate(page);
				186	}
				187
				188	STATIC inline int
				189	test_page_region(
				190	struct page *page,
				191	size_t offset,
				192	size_t length)
				193	{
				194	unsigned long mask = page_region_mask(offset, length);
				195
				196	return (mask && (page->private & mask) == mask);
				197	}
				198
				199	/*
				200	* Mapping of multi-page buffers into contiguous virtual space
				201	*/
				202
				203	typedef struct a_list {
				204	void *vm_addr;
				205	struct a_list *next;
				206	} a_list_t;
				207
				208	STATIC a_list_t *as_free_head;
				209	STATIC int as_list_len;
				210	STATIC DEFINE_SPINLOCK(as_lock);
				211
				212	/*
				213	* Try to batch vunmaps because they are costly.
				214	*/
				215	STATIC void
				216	free_address(
				217	void *addr)
				218	{
				219	a_list_t *aentry;
				220
				221	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
				222	if (likely(aentry)) {
				223	spin_lock(&as_lock);
				224	aentry->next = as_free_head;
				225	aentry->vm_addr = addr;
				226	as_free_head = aentry;
				227	as_list_len++;
				228	spin_unlock(&as_lock);
				229	} else {
				230	vunmap(addr);
				231	}
				232	}
				233
				234	STATIC void
				235	purge_addresses(void)
				236	{
				237	a_list_t aentry, old;
				238
				239	if (as_free_head == NULL)
				240	return;
				241
				242	spin_lock(&as_lock);
				243	aentry = as_free_head;
				244	as_free_head = NULL;
				245	as_list_len = 0;
				246	spin_unlock(&as_lock);
				247
				248	while ((old = aentry) != NULL) {
				249	vunmap(aentry->vm_addr);
				250	aentry = aentry->next;
				251	kfree(old);
				252	}
				253	}
				254
				255	/*
				256	* Internal pagebuf object manipulation
				257	*/
				258
				259	STATIC void
				260	_pagebuf_initialize(
				261	xfs_buf_t *pb,
				262	xfs_buftarg_t *target,
				263	loff_t range_base,
				264	size_t range_length,
				265	page_buf_flags_t flags)
				266	{
				267	/*
				268	* We don't want certain flags to appear in pb->pb_flags.
				269	*/
				270	flags &= ~(PBF_LOCK\|PBF_MAPPED\|PBF_DONT_BLOCK\|PBF_READ_AHEAD);
				271
				272	memset(pb, 0, sizeof(xfs_buf_t));
				273	atomic_set(&pb->pb_hold, 1);
				274	init_MUTEX_LOCKED(&pb->pb_iodonesema);
				275	INIT_LIST_HEAD(&pb->pb_list);
				276	INIT_LIST_HEAD(&pb->pb_hash_list);
				277	init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
				278	PB_SET_OWNER(pb);
				279	pb->pb_target = target;
				280	pb->pb_file_offset = range_base;
				281	/*
				282	* Set buffer_length and count_desired to the same value initially.
				283	* I/O routines should use count_desired, which will be the same in
				284	* most cases but may be reset (e.g. XFS recovery).
				285	*/
				286	pb->pb_buffer_length = pb->pb_count_desired = range_length;
				287	pb->pb_flags = flags \| PBF_NONE;
				288	pb->pb_bn = XFS_BUF_DADDR_NULL;
				289	atomic_set(&pb->pb_pin_count, 0);
				290	init_waitqueue_head(&pb->pb_waiters);
				291
				292	XFS_STATS_INC(pb_create);
				293	PB_TRACE(pb, "initialize", target);
				294	}
				295
				296	/*
				297	* Allocate a page array capable of holding a specified number
				298	* of pages, and point the page buf at it.
				299	*/
				300	STATIC int
				301	_pagebuf_get_pages(
				302	xfs_buf_t *pb,
				303	int page_count,
				304	page_buf_flags_t flags)
				305	{
				306	/* Make sure that we have a page list */
				307	if (pb->pb_pages == NULL) {
				308	pb->pb_offset = page_buf_poff(pb->pb_file_offset);
				309	pb->pb_page_count = page_count;
				310	if (page_count <= PB_PAGES) {
				311	pb->pb_pages = pb->pb_page_array;
				312	} else {
				313	pb->pb_pages = kmem_alloc(sizeof(struct page )
				314	page_count, pb_to_km(flags));
				315	if (pb->pb_pages == NULL)
				316	return -ENOMEM;
				317	}
				318	memset(pb->pb_pages, 0, sizeof(struct page ) page_count);
				319	}
				320	return 0;
				321	}
				322
				323	/*
				324	* Frees pb_pages if it was malloced.
				325	*/
				326	STATIC void
				327	_pagebuf_free_pages(
				328	xfs_buf_t *bp)
				329	{
				330	if (bp->pb_pages != bp->pb_page_array) {
				331	kmem_free(bp->pb_pages,
				332	bp->pb_page_count * sizeof(struct page *));
				333	}
				334	}
				335
				336	/*
				337	* Releases the specified buffer.
				338	*
				339	* The modification state of any associated pages is left unchanged.
				340	* The buffer most not be on any hash - use pagebuf_rele instead for
				341	* hashed and refcounted buffers
				342	*/
				343	void
				344	pagebuf_free(
				345	xfs_buf_t *bp)
				346	{
				347	PB_TRACE(bp, "free", 0);
				348
				349	ASSERT(list_empty(&bp->pb_hash_list));
				350
				351	if (bp->pb_flags & _PBF_PAGE_CACHE) {
				352	uint i;
				353
				354	if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
				355	free_address(bp->pb_addr - bp->pb_offset);
				356
				357	for (i = 0; i < bp->pb_page_count; i++)
				358	page_cache_release(bp->pb_pages[i]);
				359	_pagebuf_free_pages(bp);
				360	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
				361	/*
				362	* XXX(hch): bp->pb_count_desired might be incorrect (see
				363	* pagebuf_associate_memory for details), but fortunately
				364	* the Linux version of kmem_free ignores the len argument..
				365	*/
				366	kmem_free(bp->pb_addr, bp->pb_count_desired);
				367	_pagebuf_free_pages(bp);
				368	}
				369
				370	pagebuf_deallocate(bp);
				371	}
				372
				373	/*
				374	* Finds all pages for buffer in question and builds it's page list.
				375	*/
				376	STATIC int
				377	_pagebuf_lookup_pages(
				378	xfs_buf_t *bp,
				379	uint flags)
				380	{
				381	struct address_space *mapping = bp->pb_target->pbr_mapping;
				382	size_t blocksize = bp->pb_target->pbr_bsize;
				383	size_t size = bp->pb_count_desired;
				384	size_t nbytes, offset;
				385	int gfp_mask = pb_to_gfp(flags);
				386	unsigned short page_count, i;
				387	pgoff_t first;
				388	loff_t end;
				389	int error;
				390
				391	end = bp->pb_file_offset + bp->pb_buffer_length;
				392	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
				393
				394	error = _pagebuf_get_pages(bp, page_count, flags);
				395	if (unlikely(error))
				396	return error;
				397	bp->pb_flags \|= _PBF_PAGE_CACHE;
				398
				399	offset = bp->pb_offset;
				400	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
				401
				402	for (i = 0; i < bp->pb_page_count; i++) {
				403	struct page *page;
				404	uint retries = 0;
				405
				406	retry:
				407	page = find_or_create_page(mapping, first + i, gfp_mask);
				408	if (unlikely(page == NULL)) {
				409	if (flags & PBF_READ_AHEAD) {
				410	bp->pb_page_count = i;
				411	for (i = 0; i < bp->pb_page_count; i++)
				412	unlock_page(bp->pb_pages[i]);
				413	return -ENOMEM;
				414	}
				415
				416	/*
				417	* This could deadlock.
				418	*
				419	* But until all the XFS lowlevel code is revamped to
				420	* handle buffer allocation failures we can't do much.
				421	*/
				422	if (!(++retries % 100))
				423	printk(KERN_ERR
				424	"XFS: possible memory allocation "
				425	"deadlock in %s (mode:0x%x)\n",
				426	__FUNCTION__, gfp_mask);
				427
				428	XFS_STATS_INC(pb_page_retries);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	429	xfsbufd_wakeup(0, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	430	blk_congestion_wait(WRITE, HZ/50);
				431	goto retry;
				432	}
				433
				434	XFS_STATS_INC(pb_page_found);
				435
				436	nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
				437	size -= nbytes;
				438
				439	if (!PageUptodate(page)) {
				440	page_count--;
				441	if (blocksize >= PAGE_CACHE_SIZE) {
				442	if (flags & PBF_READ)
				443	bp->pb_locked = 1;
				444	} else if (!PagePrivate(page)) {
				445	if (test_page_region(page, offset, nbytes))
				446	page_count++;
				447	}
				448	}
				449
				450	bp->pb_pages[i] = page;
				451	offset = 0;
				452	}
				453
				454	if (!bp->pb_locked) {
				455	for (i = 0; i < bp->pb_page_count; i++)
				456	unlock_page(bp->pb_pages[i]);
				457	}
				458
				459	if (page_count) {
				460	/* if we have any uptodate pages, mark that in the buffer */
				461	bp->pb_flags &= ~PBF_NONE;
				462
				463	/* if some pages aren't uptodate, mark that in the buffer */
				464	if (page_count != bp->pb_page_count)
				465	bp->pb_flags \|= PBF_PARTIAL;
				466	}
				467
				468	PB_TRACE(bp, "lookup_pages", (long)page_count);
				469	return error;
				470	}
				471
				472	/*
				473	* Map buffer into kernel address-space if nessecary.
				474	*/
				475	STATIC int
				476	_pagebuf_map_pages(
				477	xfs_buf_t *bp,
				478	uint flags)
				479	{
				480	/* A single page buffer is always mappable */
				481	if (bp->pb_page_count == 1) {
				482	bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
				483	bp->pb_flags \|= PBF_MAPPED;
				484	} else if (flags & PBF_MAPPED) {
				485	if (as_list_len > 64)
				486	purge_addresses();
				487	bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				488	VM_MAP, PAGE_KERNEL);
				489	if (unlikely(bp->pb_addr == NULL))
				490	return -ENOMEM;
				491	bp->pb_addr += bp->pb_offset;
				492	bp->pb_flags \|= PBF_MAPPED;
				493	}
				494
				495	return 0;
				496	}
				497
				498	/*
				499	* Finding and Reading Buffers
				500	*/
				501
				502	/*
				503	* _pagebuf_find
				504	*
				505	* Looks up, and creates if absent, a lockable buffer for
				506	* a given range of an inode. The buffer is returned
				507	* locked. If other overlapping buffers exist, they are
				508	* released before the new buffer is created and locked,
				509	* which may imply that this call will block until those buffers
				510	* are unlocked. No I/O is implied by this call.
				511	*/
				512	xfs_buf_t *
				513	_pagebuf_find(
				514	xfs_buftarg_t btp, / block device target */
				515	loff_t ioff, /* starting offset of range */
				516	size_t isize, /* length of range */
				517	page_buf_flags_t flags, /* PBF_TRYLOCK */
				518	xfs_buf_t new_pb)/ newly allocated buffer */
				519	{
				520	loff_t range_base;
				521	size_t range_length;
				522	xfs_bufhash_t *hash;
				523	xfs_buf_t pb, n;
				524
				525	range_base = (ioff << BBSHIFT);
				526	range_length = (isize << BBSHIFT);
				527
				528	/* Check for IOs smaller than the sector size / not sector aligned */
				529	ASSERT(!(range_length < (1 << btp->pbr_sshift)));
				530	ASSERT(!(range_base & (loff_t)btp->pbr_smask));
				531
				532	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
				533
				534	spin_lock(&hash->bh_lock);
				535
				536	list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
				537	ASSERT(btp == pb->pb_target);
				538	if (pb->pb_file_offset == range_base &&
				539	pb->pb_buffer_length == range_length) {
				540	/*
				541	* If we look at something bring it to the
				542	* front of the list for next time.
				543	*/
				544	atomic_inc(&pb->pb_hold);
				545	list_move(&pb->pb_hash_list, &hash->bh_list);
				546	goto found;
				547	}
				548	}
				549
				550	/* No match found */
				551	if (new_pb) {
				552	_pagebuf_initialize(new_pb, btp, range_base,
				553	range_length, flags);
				554	new_pb->pb_hash = hash;
				555	list_add(&new_pb->pb_hash_list, &hash->bh_list);
				556	} else {
				557	XFS_STATS_INC(pb_miss_locked);
				558	}
				559
				560	spin_unlock(&hash->bh_lock);
				561	return new_pb;
				562
				563	found:
				564	spin_unlock(&hash->bh_lock);
				565
				566	/* Attempt to get the semaphore without sleeping,
				567	* if this does not work then we need to drop the
				568	* spinlock and do a hard attempt on the semaphore.
				569	*/
				570	if (down_trylock(&pb->pb_sema)) {
				571	if (!(flags & PBF_TRYLOCK)) {
				572	/* wait for buffer ownership */
				573	PB_TRACE(pb, "get_lock", 0);
				574	pagebuf_lock(pb);
				575	XFS_STATS_INC(pb_get_locked_waited);
				576	} else {
				577	/* We asked for a trylock and failed, no need
				578	* to look at file offset and length here, we
				579	* know that this pagebuf at least overlaps our
				580	* pagebuf and is locked, therefore our buffer
				581	* either does not exist, or is this buffer
				582	*/
				583
				584	pagebuf_rele(pb);
				585	XFS_STATS_INC(pb_busy_locked);
				586	return (NULL);
				587	}
				588	} else {
				589	/* trylock worked */
				590	PB_SET_OWNER(pb);
				591	}
				592
				593	if (pb->pb_flags & PBF_STALE)
				594	pb->pb_flags &= PBF_MAPPED;
				595	PB_TRACE(pb, "got_lock", 0);
				596	XFS_STATS_INC(pb_get_locked);
				597	return (pb);
				598	}
				599
				600	/*
				601	* xfs_buf_get_flags assembles a buffer covering the specified range.
				602	*
				603	* Storage in memory for all portions of the buffer will be allocated,
				604	* although backing storage may not be.
				605	*/
				606	xfs_buf_t *
				607	xfs_buf_get_flags( /* allocate a buffer */
				608	xfs_buftarg_t target,/ target for buffer */
				609	loff_t ioff, /* starting offset of range */
				610	size_t isize, /* length of range */
				611	page_buf_flags_t flags) /* PBF_TRYLOCK */
				612	{
				613	xfs_buf_t pb, new_pb;
				614	int error = 0, i;
				615
				616	new_pb = pagebuf_allocate(flags);
				617	if (unlikely(!new_pb))
				618	return NULL;
				619
				620	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
				621	if (pb == new_pb) {
				622	error = _pagebuf_lookup_pages(pb, flags);
				623	if (error)
				624	goto no_buffer;
				625	} else {
				626	pagebuf_deallocate(new_pb);
				627	if (unlikely(pb == NULL))
				628	return NULL;
				629	}
				630
				631	for (i = 0; i < pb->pb_page_count; i++)
				632	mark_page_accessed(pb->pb_pages[i]);
				633
				634	if (!(pb->pb_flags & PBF_MAPPED)) {
				635	error = _pagebuf_map_pages(pb, flags);
				636	if (unlikely(error)) {
				637	printk(KERN_WARNING "%s: failed to map pages\n",
				638	__FUNCTION__);
				639	goto no_buffer;
				640	}
				641	}
				642
				643	XFS_STATS_INC(pb_get);
				644
				645	/*
				646	* Always fill in the block number now, the mapped cases can do
				647	* their own overlay of this later.
				648	*/
				649	pb->pb_bn = ioff;
				650	pb->pb_count_desired = pb->pb_buffer_length;
				651
				652	PB_TRACE(pb, "get", (unsigned long)flags);
				653	return pb;
				654
				655	no_buffer:
				656	if (flags & (PBF_LOCK \| PBF_TRYLOCK))
				657	pagebuf_unlock(pb);
				658	pagebuf_rele(pb);
				659	return NULL;
				660	}
				661
				662	xfs_buf_t *
				663	xfs_buf_read_flags(
				664	xfs_buftarg_t *target,
				665	loff_t ioff,
				666	size_t isize,
				667	page_buf_flags_t flags)
				668	{
				669	xfs_buf_t *pb;
				670
				671	flags \|= PBF_READ;
				672
				673	pb = xfs_buf_get_flags(target, ioff, isize, flags);
				674	if (pb) {
				675	if (PBF_NOT_DONE(pb)) {
				676	PB_TRACE(pb, "read", (unsigned long)flags);
				677	XFS_STATS_INC(pb_get_read);
				678	pagebuf_iostart(pb, flags);
				679	} else if (flags & PBF_ASYNC) {
				680	PB_TRACE(pb, "read_async", (unsigned long)flags);
				681	/*
				682	* Read ahead call which is already satisfied,
				683	* drop the buffer
				684	*/
				685	goto no_buffer;
				686	} else {
				687	PB_TRACE(pb, "read_done", (unsigned long)flags);
				688	/* We do not want read in the flags */
				689	pb->pb_flags &= ~PBF_READ;
				690	}
				691	}
				692
				693	return pb;
				694
				695	no_buffer:
				696	if (flags & (PBF_LOCK \| PBF_TRYLOCK))
				697	pagebuf_unlock(pb);
				698	pagebuf_rele(pb);
				699	return NULL;
				700	}
				701
				702	/*
				703	* Create a skeletal pagebuf (no pages associated with it).
				704	*/
				705	xfs_buf_t *
				706	pagebuf_lookup(
				707	xfs_buftarg_t *target,
				708	loff_t ioff,
				709	size_t isize,
				710	page_buf_flags_t flags)
				711	{
				712	xfs_buf_t *pb;
				713
				714	pb = pagebuf_allocate(flags);
				715	if (pb) {
				716	_pagebuf_initialize(pb, target, ioff, isize, flags);
				717	}
				718	return pb;
				719	}
				720
				721	/*
				722	* If we are not low on memory then do the readahead in a deadlock
				723	* safe manner.
				724	*/
				725	void
				726	pagebuf_readahead(
				727	xfs_buftarg_t *target,
				728	loff_t ioff,
				729	size_t isize,
				730	page_buf_flags_t flags)
				731	{
				732	struct backing_dev_info *bdi;
				733
				734	bdi = target->pbr_mapping->backing_dev_info;
				735	if (bdi_read_congested(bdi))
				736	return;
				737
				738	flags \|= (PBF_TRYLOCK\|PBF_ASYNC\|PBF_READ_AHEAD);
				739	xfs_buf_read_flags(target, ioff, isize, flags);
				740	}
				741
				742	xfs_buf_t *
				743	pagebuf_get_empty(
				744	size_t len,
				745	xfs_buftarg_t *target)
				746	{
				747	xfs_buf_t *pb;
				748
				749	pb = pagebuf_allocate(0);
				750	if (pb)
				751	_pagebuf_initialize(pb, target, 0, len, 0);
				752	return pb;
				753	}
				754
				755	static inline struct page *
				756	mem_to_page(
				757	void *addr)
				758	{
				759	if (((unsigned long)addr < VMALLOC_START) \|\|
				760	((unsigned long)addr >= VMALLOC_END)) {
				761	return virt_to_page(addr);
				762	} else {
				763	return vmalloc_to_page(addr);
				764	}
				765	}
				766
				767	int
				768	pagebuf_associate_memory(
				769	xfs_buf_t *pb,
				770	void *mem,
				771	size_t len)
				772	{
				773	int rval;
				774	int i = 0;
				775	size_t ptr;
				776	size_t end, end_cur;
				777	off_t offset;
				778	int page_count;
				779
				780	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
				781	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
				782	if (offset && (len > PAGE_CACHE_SIZE))
				783	page_count++;
				784
				785	/* Free any previous set of page pointers */
				786	if (pb->pb_pages)
				787	_pagebuf_free_pages(pb);
				788
				789	pb->pb_pages = NULL;
				790	pb->pb_addr = mem;
				791
				792	rval = _pagebuf_get_pages(pb, page_count, 0);
				793	if (rval)
				794	return rval;
				795
				796	pb->pb_offset = offset;
				797	ptr = (size_t) mem & PAGE_CACHE_MASK;
				798	end = PAGE_CACHE_ALIGN((size_t) mem + len);
				799	end_cur = end;
				800	/* set up first page */
				801	pb->pb_pages[0] = mem_to_page(mem);
				802
				803	ptr += PAGE_CACHE_SIZE;
				804	pb->pb_page_count = ++i;
				805	while (ptr < end) {
				806	pb->pb_pages[i] = mem_to_page((void *)ptr);
				807	pb->pb_page_count = ++i;
				808	ptr += PAGE_CACHE_SIZE;
				809	}
				810	pb->pb_locked = 0;
				811
				812	pb->pb_count_desired = pb->pb_buffer_length = len;
				813	pb->pb_flags \|= PBF_MAPPED;
				814
				815	return 0;
				816	}
				817
				818	xfs_buf_t *
				819	pagebuf_get_no_daddr(
				820	size_t len,
				821	xfs_buftarg_t *target)
				822	{
				823	size_t malloc_len = len;
				824	xfs_buf_t *bp;
				825	void *data;
				826	int error;
				827
				828	bp = pagebuf_allocate(0);
				829	if (unlikely(bp == NULL))
				830	goto fail;
				831	_pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);
				832
				833	try_again:
				834	data = kmem_alloc(malloc_len, KM_SLEEP \| KM_MAYFAIL);
				835	if (unlikely(data == NULL))
				836	goto fail_free_buf;
				837
				838	/* check whether alignment matches.. */
				839	if ((__psunsigned_t)data !=
				840	((__psunsigned_t)data & ~target->pbr_smask)) {
				841	/* .. else double the size and try again */
				842	kmem_free(data, malloc_len);
				843	malloc_len <<= 1;
				844	goto try_again;
				845	}
				846
				847	error = pagebuf_associate_memory(bp, data, len);
				848	if (error)
				849	goto fail_free_mem;
				850	bp->pb_flags \|= _PBF_KMEM_ALLOC;
				851
				852	pagebuf_unlock(bp);
				853
				854	PB_TRACE(bp, "no_daddr", data);
				855	return bp;
				856	fail_free_mem:
				857	kmem_free(data, malloc_len);
				858	fail_free_buf:
				859	pagebuf_free(bp);
				860	fail:
				861	return NULL;
				862	}
				863
				864	/*
				865	* pagebuf_hold
				866	*
				867	* Increment reference count on buffer, to hold the buffer concurrently
				868	* with another thread which may release (free) the buffer asynchronously.
				869	*
				870	* Must hold the buffer already to call this function.
				871	*/
				872	void
				873	pagebuf_hold(
				874	xfs_buf_t *pb)
				875	{
				876	atomic_inc(&pb->pb_hold);
				877	PB_TRACE(pb, "hold", 0);
				878	}
				879
				880	/*
				881	* pagebuf_rele
				882	*
				883	* pagebuf_rele releases a hold on the specified buffer. If the
				884	* the hold count is 1, pagebuf_rele calls pagebuf_free.
				885	*/
				886	void
				887	pagebuf_rele(
				888	xfs_buf_t *pb)
				889	{
				890	xfs_bufhash_t *hash = pb->pb_hash;
				891
				892	PB_TRACE(pb, "rele", pb->pb_relse);
				893
				894	/*
				895	* pagebuf_lookup buffers are not hashed, not delayed write,
				896	* and don't have their own release routines. Special case.
				897	*/
				898	if (unlikely(!hash)) {
				899	ASSERT(!pb->pb_relse);
				900	if (atomic_dec_and_test(&pb->pb_hold))
				901	xfs_buf_free(pb);
				902	return;
				903	}
				904
				905	if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
				906	int do_free = 1;
				907
				908	if (pb->pb_relse) {
				909	atomic_inc(&pb->pb_hold);
				910	spin_unlock(&hash->bh_lock);
				911	(*(pb->pb_relse)) (pb);
				912	spin_lock(&hash->bh_lock);
				913	do_free = 0;
				914	}
				915
				916	if (pb->pb_flags & PBF_DELWRI) {
				917	pb->pb_flags \|= PBF_ASYNC;
				918	atomic_inc(&pb->pb_hold);
				919	pagebuf_delwri_queue(pb, 0);
				920	do_free = 0;
				921	} else if (pb->pb_flags & PBF_FS_MANAGED) {
				922	do_free = 0;
				923	}
				924
				925	if (do_free) {
				926	list_del_init(&pb->pb_hash_list);
				927	spin_unlock(&hash->bh_lock);
				928	pagebuf_free(pb);
				929	} else {
				930	spin_unlock(&hash->bh_lock);
				931	}
				932	}
				933	}
				934
				935
				936	/*
				937	* Mutual exclusion on buffers. Locking model:
				938	*
				939	* Buffers associated with inodes for which buffer locking
				940	* is not enabled are not protected by semaphores, and are
				941	* assumed to be exclusively owned by the caller. There is a
				942	* spinlock in the buffer, used by the caller when concurrent
				943	* access is possible.
				944	*/
				945
				946	/*
				947	* pagebuf_cond_lock
				948	*
				949	* pagebuf_cond_lock locks a buffer object, if it is not already locked.
				950	* Note that this in no way
				951	* locks the underlying pages, so it is only useful for synchronizing
				952	* concurrent use of page buffer objects, not for synchronizing independent
				953	* access to the underlying pages.
				954	*/
				955	int
				956	pagebuf_cond_lock( /* lock buffer, if not locked */
				957	/* returns -EBUSY if locked) */
				958	xfs_buf_t *pb)
				959	{
				960	int locked;
				961
				962	locked = down_trylock(&pb->pb_sema) == 0;
				963	if (locked) {
				964	PB_SET_OWNER(pb);
				965	}
				966	PB_TRACE(pb, "cond_lock", (long)locked);
				967	return(locked ? 0 : -EBUSY);
				968	}
				969
				970	#if defined(DEBUG) \|\| defined(XFS_BLI_TRACE)
				971	/*
				972	* pagebuf_lock_value
				973	*
				974	* Return lock value for a pagebuf
				975	*/
				976	int
				977	pagebuf_lock_value(
				978	xfs_buf_t *pb)
				979	{
				980	return(atomic_read(&pb->pb_sema.count));
				981	}
				982	#endif
				983
				984	/*
				985	* pagebuf_lock
				986	*
				987	* pagebuf_lock locks a buffer object. Note that this in no way
				988	* locks the underlying pages, so it is only useful for synchronizing
				989	* concurrent use of page buffer objects, not for synchronizing independent
				990	* access to the underlying pages.
				991	*/
				992	int
				993	pagebuf_lock(
				994	xfs_buf_t *pb)
				995	{
				996	PB_TRACE(pb, "lock", 0);
				997	if (atomic_read(&pb->pb_io_remaining))
				998	blk_run_address_space(pb->pb_target->pbr_mapping);
				999	down(&pb->pb_sema);
				1000	PB_SET_OWNER(pb);
				1001	PB_TRACE(pb, "locked", 0);
				1002	return 0;
				1003	}
				1004
				1005	/*
				1006	* pagebuf_unlock
				1007	*
				1008	* pagebuf_unlock releases the lock on the buffer object created by
				1009	* pagebuf_lock or pagebuf_cond_lock (not any
				1010	* pinning of underlying pages created by pagebuf_pin).
				1011	*/
				1012	void
				1013	pagebuf_unlock( /* unlock buffer */
				1014	xfs_buf_t pb) / buffer to unlock */
				1015	{
				1016	PB_CLEAR_OWNER(pb);
				1017	up(&pb->pb_sema);
				1018	PB_TRACE(pb, "unlock", 0);
				1019	}
				1020
				1021
				1022	/*
				1023	* Pinning Buffer Storage in Memory
				1024	*/
				1025
				1026	/*
				1027	* pagebuf_pin
				1028	*
				1029	* pagebuf_pin locks all of the memory represented by a buffer in
				1030	* memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
				1031	* the same or different buffers affecting a given page, will
				1032	* properly count the number of outstanding "pin" requests. The
				1033	* buffer may be released after the pagebuf_pin and a different
				1034	* buffer used when calling pagebuf_unpin, if desired.
				1035	* pagebuf_pin should be used by the file system when it wants be
				1036	* assured that no attempt will be made to force the affected
				1037	* memory to disk. It does not assure that a given logical page
				1038	* will not be moved to a different physical page.
				1039	*/
				1040	void
				1041	pagebuf_pin(
				1042	xfs_buf_t *pb)
				1043	{
				1044	atomic_inc(&pb->pb_pin_count);
				1045	PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
				1046	}
				1047
				1048	/*
				1049	* pagebuf_unpin
				1050	*
				1051	* pagebuf_unpin reverses the locking of memory performed by
				1052	* pagebuf_pin. Note that both functions affected the logical
				1053	* pages associated with the buffer, not the buffer itself.
				1054	*/
				1055	void
				1056	pagebuf_unpin(
				1057	xfs_buf_t *pb)
				1058	{
				1059	if (atomic_dec_and_test(&pb->pb_pin_count)) {
				1060	wake_up_all(&pb->pb_waiters);
				1061	}
				1062	PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
				1063	}
				1064
				1065	int
				1066	pagebuf_ispin(
				1067	xfs_buf_t *pb)
				1068	{
				1069	return atomic_read(&pb->pb_pin_count);
				1070	}
				1071
				1072	/*
				1073	* pagebuf_wait_unpin
				1074	*
				1075	* pagebuf_wait_unpin waits until all of the memory associated
				1076	* with the buffer is not longer locked in memory. It returns
				1077	* immediately if none of the affected pages are locked.
				1078	*/
				1079	static inline void
				1080	_pagebuf_wait_unpin(
				1081	xfs_buf_t *pb)
				1082	{
				1083	DECLARE_WAITQUEUE (wait, current);
				1084
				1085	if (atomic_read(&pb->pb_pin_count) == 0)
				1086	return;
				1087
				1088	add_wait_queue(&pb->pb_waiters, &wait);
				1089	for (;;) {
				1090	set_current_state(TASK_UNINTERRUPTIBLE);
				1091	if (atomic_read(&pb->pb_pin_count) == 0)
				1092	break;
				1093	if (atomic_read(&pb->pb_io_remaining))
				1094	blk_run_address_space(pb->pb_target->pbr_mapping);
				1095	schedule();
				1096	}
				1097	remove_wait_queue(&pb->pb_waiters, &wait);
				1098	set_current_state(TASK_RUNNING);
				1099	}
				1100
				1101	/*
				1102	* Buffer Utility Routines
				1103	*/
				1104
				1105	/*
				1106	* pagebuf_iodone
				1107	*
				1108	* pagebuf_iodone marks a buffer for which I/O is in progress
				1109	* done with respect to that I/O. The pb_iodone routine, if
				1110	* present, will be called as a side-effect.
				1111	*/
				1112	STATIC void
				1113	pagebuf_iodone_work(
				1114	void *v)
				1115	{
				1116	xfs_buf_t bp = (xfs_buf_t )v;
				1117
				1118	if (bp->pb_iodone)
				1119	(*(bp->pb_iodone))(bp);
				1120	else if (bp->pb_flags & PBF_ASYNC)
				1121	xfs_buf_relse(bp);
				1122	}
				1123
				1124	void
				1125	pagebuf_iodone(
				1126	xfs_buf_t *pb,
				1127	int dataio,
				1128	int schedule)
				1129	{
				1130	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE);
				1131	if (pb->pb_error == 0) {
				1132	pb->pb_flags &= ~(PBF_PARTIAL \| PBF_NONE);
				1133	}
				1134
				1135	PB_TRACE(pb, "iodone", pb->pb_iodone);
				1136
				1137	if ((pb->pb_iodone) \|\| (pb->pb_flags & PBF_ASYNC)) {
				1138	if (schedule) {
				1139	INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1140	queue_work(dataio ? xfsdatad_workqueue :
				1141	xfslogd_workqueue, &pb->pb_iodone_work);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1142	} else {
				1143	pagebuf_iodone_work(pb);
				1144	}
				1145	} else {
				1146	up(&pb->pb_iodonesema);
				1147	}
				1148	}
				1149
				1150	/*
				1151	* pagebuf_ioerror
				1152	*
				1153	* pagebuf_ioerror sets the error code for a buffer.
				1154	*/
				1155	void
				1156	pagebuf_ioerror( /* mark/clear buffer error flag */
				1157	xfs_buf_t pb, / buffer to mark */
				1158	int error) /* error to store (0 if none) */
				1159	{
				1160	ASSERT(error >= 0 && error <= 0xffff);
				1161	pb->pb_error = (unsigned short)error;
				1162	PB_TRACE(pb, "ioerror", (unsigned long)error);
				1163	}
				1164
				1165	/*
				1166	* pagebuf_iostart
				1167	*
				1168	* pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
				1169	* If necessary, it will arrange for any disk space allocation required,
				1170	* and it will break up the request if the block mappings require it.
				1171	* The pb_iodone routine in the buffer supplied will only be called
				1172	* when all of the subsidiary I/O requests, if any, have been completed.
				1173	* pagebuf_iostart calls the pagebuf_ioinitiate routine or
				1174	* pagebuf_iorequest, if the former routine is not defined, to start
				1175	* the I/O on a given low-level request.
				1176	*/
				1177	int
				1178	pagebuf_iostart( /* start I/O on a buffer */
				1179	xfs_buf_t pb, / buffer to start */
				1180	page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
				1181	/* PBF_WRITE, PBF_DELWRI, */
				1182	/* PBF_DONT_BLOCK */
				1183	{
				1184	int status = 0;
				1185
				1186	PB_TRACE(pb, "iostart", (unsigned long)flags);
				1187
				1188	if (flags & PBF_DELWRI) {
				1189	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE \| PBF_ASYNC);
				1190	pb->pb_flags \|= flags & (PBF_DELWRI \| PBF_ASYNC);
				1191	pagebuf_delwri_queue(pb, 1);
				1192	return status;
				1193	}
				1194
				1195	pb->pb_flags &= ~(PBF_READ \| PBF_WRITE \| PBF_ASYNC \| PBF_DELWRI \| \
				1196	PBF_READ_AHEAD \| _PBF_RUN_QUEUES);
				1197	pb->pb_flags \|= flags & (PBF_READ \| PBF_WRITE \| PBF_ASYNC \| \
				1198	PBF_READ_AHEAD \| _PBF_RUN_QUEUES);
				1199
				1200	BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
				1201
				1202	/* For writes allow an alternate strategy routine to precede
				1203	* the actual I/O request (which may not be issued at all in
				1204	* a shutdown situation, for example).
				1205	*/
				1206	status = (flags & PBF_WRITE) ?
				1207	pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
				1208
				1209	/* Wait for I/O if we are not an async request.
				1210	* Note: async I/O request completion will release the buffer,
				1211	* and that can already be done by this point. So using the
				1212	* buffer pointer from here on, after async I/O, is invalid.
				1213	*/
				1214	if (!status && !(flags & PBF_ASYNC))
				1215	status = pagebuf_iowait(pb);
				1216
				1217	return status;
				1218	}
				1219
				1220	/*
				1221	* Helper routine for pagebuf_iorequest
				1222	*/
				1223
				1224	STATIC __inline__ int
				1225	_pagebuf_iolocked(
				1226	xfs_buf_t *pb)
				1227	{
				1228	ASSERT(pb->pb_flags & (PBF_READ\|PBF_WRITE));
				1229	if (pb->pb_flags & PBF_READ)
				1230	return pb->pb_locked;
				1231	return 0;
				1232	}
				1233
				1234	STATIC __inline__ void
				1235	_pagebuf_iodone(
				1236	xfs_buf_t *pb,
				1237	int schedule)
				1238	{
				1239	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
				1240	pb->pb_locked = 0;
				1241	pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
				1242	}
				1243	}
				1244
				1245	STATIC int
				1246	bio_end_io_pagebuf(
				1247	struct bio *bio,
				1248	unsigned int bytes_done,
				1249	int error)
				1250	{
				1251	xfs_buf_t pb = (xfs_buf_t )bio->bi_private;
				1252	unsigned int i, blocksize = pb->pb_target->pbr_bsize;
				1253	struct bio_vec *bvec = bio->bi_io_vec;
				1254
				1255	if (bio->bi_size)
				1256	return 1;
				1257
				1258	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				1259	pb->pb_error = EIO;
				1260
				1261	for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
				1262	struct page *page = bvec->bv_page;
				1263
				1264	if (pb->pb_error) {
				1265	SetPageError(page);
				1266	} else if (blocksize == PAGE_CACHE_SIZE) {
				1267	SetPageUptodate(page);
				1268	} else if (!PagePrivate(page) &&
				1269	(pb->pb_flags & _PBF_PAGE_CACHE)) {
				1270	set_page_region(page, bvec->bv_offset, bvec->bv_len);
				1271	}
				1272
				1273	if (_pagebuf_iolocked(pb)) {
				1274	unlock_page(page);
				1275	}
				1276	}
				1277
				1278	_pagebuf_iodone(pb, 1);
				1279	bio_put(bio);
				1280	return 0;
				1281	}
				1282
				1283	STATIC void
				1284	_pagebuf_ioapply(
				1285	xfs_buf_t *pb)
				1286	{
				1287	int i, rw, map_i, total_nr_pages, nr_pages;
				1288	struct bio *bio;
				1289	int offset = pb->pb_offset;
				1290	int size = pb->pb_count_desired;
				1291	sector_t sector = pb->pb_bn;
				1292	unsigned int blocksize = pb->pb_target->pbr_bsize;
				1293	int locking = _pagebuf_iolocked(pb);
				1294
				1295	total_nr_pages = pb->pb_page_count;
				1296	map_i = 0;
				1297
				1298	if (pb->pb_flags & _PBF_RUN_QUEUES) {
				1299	pb->pb_flags &= ~_PBF_RUN_QUEUES;
				1300	rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC;
				1301	} else {
				1302	rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
				1303	}
				1304
				1305	/* Special code path for reading a sub page size pagebuf in --
				1306	* we populate up the whole page, and hence the other metadata
				1307	* in the same page. This optimization is only valid when the
				1308	* filesystem block size and the page size are equal.
				1309	*/
				1310	if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
				1311	(pb->pb_flags & PBF_READ) && locking &&
				1312	(blocksize == PAGE_CACHE_SIZE)) {
				1313	bio = bio_alloc(GFP_NOIO, 1);
				1314
				1315	bio->bi_bdev = pb->pb_target->pbr_bdev;
				1316	bio->bi_sector = sector - (offset >> BBSHIFT);
				1317	bio->bi_end_io = bio_end_io_pagebuf;
				1318	bio->bi_private = pb;
				1319
				1320	bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
				1321	size = 0;
				1322
				1323	atomic_inc(&pb->pb_io_remaining);
				1324
				1325	goto submit_io;
				1326	}
				1327
				1328	/* Lock down the pages which we need to for the request */
				1329	if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
				1330	for (i = 0; size; i++) {
				1331	int nbytes = PAGE_CACHE_SIZE - offset;
				1332	struct page *page = pb->pb_pages[i];
				1333
				1334	if (nbytes > size)
				1335	nbytes = size;
				1336
				1337	lock_page(page);
				1338
				1339	size -= nbytes;
				1340	offset = 0;
				1341	}
				1342	offset = pb->pb_offset;
				1343	size = pb->pb_count_desired;
				1344	}
				1345
				1346	next_chunk:
				1347	atomic_inc(&pb->pb_io_remaining);
				1348	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
				1349	if (nr_pages > total_nr_pages)
				1350	nr_pages = total_nr_pages;
				1351
				1352	bio = bio_alloc(GFP_NOIO, nr_pages);
				1353	bio->bi_bdev = pb->pb_target->pbr_bdev;
				1354	bio->bi_sector = sector;
				1355	bio->bi_end_io = bio_end_io_pagebuf;
				1356	bio->bi_private = pb;
				1357
				1358	for (; size && nr_pages; nr_pages--, map_i++) {
				1359	int nbytes = PAGE_CACHE_SIZE - offset;
				1360
				1361	if (nbytes > size)
				1362	nbytes = size;
				1363
				1364	if (bio_add_page(bio, pb->pb_pages[map_i],
				1365	nbytes, offset) < nbytes)
				1366	break;
				1367
				1368	offset = 0;
				1369	sector += nbytes >> BBSHIFT;
				1370	size -= nbytes;
				1371	total_nr_pages--;
				1372	}
				1373
				1374	submit_io:
				1375	if (likely(bio->bi_size)) {
				1376	submit_bio(rw, bio);
				1377	if (size)
				1378	goto next_chunk;
				1379	} else {
				1380	bio_put(bio);
				1381	pagebuf_ioerror(pb, EIO);
				1382	}
				1383	}
				1384
				1385	/*
				1386	* pagebuf_iorequest -- the core I/O request routine.
				1387	*/
				1388	int
				1389	pagebuf_iorequest( /* start real I/O */
				1390	xfs_buf_t pb) / buffer to convey to device */
				1391	{
				1392	PB_TRACE(pb, "iorequest", 0);
				1393
				1394	if (pb->pb_flags & PBF_DELWRI) {
				1395	pagebuf_delwri_queue(pb, 1);
				1396	return 0;
				1397	}
				1398
				1399	if (pb->pb_flags & PBF_WRITE) {
				1400	_pagebuf_wait_unpin(pb);
				1401	}
				1402
				1403	pagebuf_hold(pb);
				1404
				1405	/* Set the count to 1 initially, this will stop an I/O
				1406	* completion callout which happens before we have started
				1407	* all the I/O from calling pagebuf_iodone too early.
				1408	*/
				1409	atomic_set(&pb->pb_io_remaining, 1);
				1410	_pagebuf_ioapply(pb);
				1411	_pagebuf_iodone(pb, 0);
				1412
				1413	pagebuf_rele(pb);
				1414	return 0;
				1415	}
				1416
				1417	/*
				1418	* pagebuf_iowait
				1419	*
				1420	* pagebuf_iowait waits for I/O to complete on the buffer supplied.
				1421	* It returns immediately if no I/O is pending. In any case, it returns
				1422	* the error code, if any, or 0 if there is no error.
				1423	*/
				1424	int
				1425	pagebuf_iowait(
				1426	xfs_buf_t *pb)
				1427	{
				1428	PB_TRACE(pb, "iowait", 0);
				1429	if (atomic_read(&pb->pb_io_remaining))
				1430	blk_run_address_space(pb->pb_target->pbr_mapping);
				1431	down(&pb->pb_iodonesema);
				1432	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
				1433	return pb->pb_error;
				1434	}
				1435
				1436	caddr_t
				1437	pagebuf_offset(
				1438	xfs_buf_t *pb,
				1439	size_t offset)
				1440	{
				1441	struct page *page;
				1442
				1443	offset += pb->pb_offset;
				1444
				1445	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
				1446	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
				1447	}
				1448
				1449	/*
				1450	* pagebuf_iomove
				1451	*
				1452	* Move data into or out of a buffer.
				1453	*/
				1454	void
				1455	pagebuf_iomove(
				1456	xfs_buf_t pb, / buffer to process */
				1457	size_t boff, /* starting buffer offset */
				1458	size_t bsize, /* length to copy */
				1459	caddr_t data, /* data address */
				1460	page_buf_rw_t mode) /* read/write flag */
				1461	{
				1462	size_t bend, cpoff, csize;
				1463	struct page *page;
				1464
				1465	bend = boff + bsize;
				1466	while (boff < bend) {
				1467	page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
				1468	cpoff = page_buf_poff(boff + pb->pb_offset);
				1469	csize = min_t(size_t,
				1470	PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
				1471
				1472	ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
				1473
				1474	switch (mode) {
				1475	case PBRW_ZERO:
				1476	memset(page_address(page) + cpoff, 0, csize);
				1477	break;
				1478	case PBRW_READ:
				1479	memcpy(data, page_address(page) + cpoff, csize);
				1480	break;
				1481	case PBRW_WRITE:
				1482	memcpy(page_address(page) + cpoff, data, csize);
				1483	}
				1484
				1485	boff += csize;
				1486	data += csize;
				1487	}
				1488	}
				1489
				1490	/*
				1491	* Handling of buftargs.
				1492	*/
				1493
				1494	/*
				1495	* Wait for any bufs with callbacks that have been submitted but
				1496	* have not yet returned... walk the hash list for the target.
				1497	*/
				1498	void
				1499	xfs_wait_buftarg(
				1500	xfs_buftarg_t *btp)
				1501	{
				1502	xfs_buf_t bp, n;
				1503	xfs_bufhash_t *hash;
				1504	uint i;
				1505
				1506	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
				1507	hash = &btp->bt_hash[i];
				1508	again:
				1509	spin_lock(&hash->bh_lock);
				1510	list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) {
				1511	ASSERT(btp == bp->pb_target);
				1512	if (!(bp->pb_flags & PBF_FS_MANAGED)) {
				1513	spin_unlock(&hash->bh_lock);
				1514	delay(100);
				1515	goto again;
				1516	}
				1517	}
				1518	spin_unlock(&hash->bh_lock);
				1519	}
				1520	}
				1521
				1522	/*
				1523	* Allocate buffer hash table for a given target.
				1524	* For devices containing metadata (i.e. not the log/realtime devices)
				1525	* we need to allocate a much larger hash table.
				1526	*/
				1527	STATIC void
				1528	xfs_alloc_bufhash(
				1529	xfs_buftarg_t *btp,
				1530	int external)
				1531	{
				1532	unsigned int i;
				1533
				1534	btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
				1535	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
				1536	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
				1537	sizeof(xfs_bufhash_t), KM_SLEEP);
				1538	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
				1539	spin_lock_init(&btp->bt_hash[i].bh_lock);
				1540	INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
				1541	}
				1542	}
				1543
				1544	STATIC void
				1545	xfs_free_bufhash(
				1546	xfs_buftarg_t *btp)
				1547	{
				1548	kmem_free(btp->bt_hash,
				1549	(1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
				1550	btp->bt_hash = NULL;
				1551	}
				1552
				1553	void
				1554	xfs_free_buftarg(
				1555	xfs_buftarg_t *btp,
				1556	int external)
				1557	{
				1558	xfs_flush_buftarg(btp, 1);
				1559	if (external)
				1560	xfs_blkdev_put(btp->pbr_bdev);
				1561	xfs_free_bufhash(btp);
				1562	iput(btp->pbr_mapping->host);
				1563	kmem_free(btp, sizeof(*btp));
				1564	}
				1565
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1566	STATIC int
				1567	xfs_setsize_buftarg_flags(
				1568	xfs_buftarg_t *btp,
				1569	unsigned int blocksize,
				1570	unsigned int sectorsize,
				1571	int verbose)
				1572	{
				1573	btp->pbr_bsize = blocksize;
				1574	btp->pbr_sshift = ffs(sectorsize) - 1;
				1575	btp->pbr_smask = sectorsize - 1;
				1576
				1577	if (set_blocksize(btp->pbr_bdev, sectorsize)) {
				1578	printk(KERN_WARNING
				1579	"XFS: Cannot set_blocksize to %u on device %s\n",
				1580	sectorsize, XFS_BUFTARG_NAME(btp));
				1581	return EINVAL;
				1582	}
				1583
				1584	if (verbose &&
				1585	(PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
				1586	printk(KERN_WARNING
				1587	"XFS: %u byte sectors in use on device %s. "
				1588	"This is suboptimal; %u or greater is ideal.\n",
				1589	sectorsize, XFS_BUFTARG_NAME(btp),
				1590	(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
				1591	}
				1592
				1593	return 0;
				1594	}
				1595
				1596	/*
				1597	* When allocating the initial buffer target we have not yet
				1598	* read in the superblock, so don't know what sized sectors
				1599	* are being used is at this early stage. Play safe.
				1600	*/
				1601	STATIC int
				1602	xfs_setsize_buftarg_early(
				1603	xfs_buftarg_t *btp,
				1604	struct block_device *bdev)
				1605	{
				1606	return xfs_setsize_buftarg_flags(btp,
				1607	PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
				1608	}
				1609
				1610	int
				1611	xfs_setsize_buftarg(
				1612	xfs_buftarg_t *btp,
				1613	unsigned int blocksize,
				1614	unsigned int sectorsize)
				1615	{
				1616	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
				1617	}
				1618
				1619	STATIC int
				1620	xfs_mapping_buftarg(
				1621	xfs_buftarg_t *btp,
				1622	struct block_device *bdev)
				1623	{
				1624	struct backing_dev_info *bdi;
				1625	struct inode *inode;
				1626	struct address_space *mapping;
				1627	static struct address_space_operations mapping_aops = {
				1628	.sync_page = block_sync_page,
				1629	};
				1630
				1631	inode = new_inode(bdev->bd_inode->i_sb);
				1632	if (!inode) {
				1633	printk(KERN_WARNING
				1634	"XFS: Cannot allocate mapping inode for device %s\n",
				1635	XFS_BUFTARG_NAME(btp));
				1636	return ENOMEM;
				1637	}
				1638	inode->i_mode = S_IFBLK;
				1639	inode->i_bdev = bdev;
				1640	inode->i_rdev = bdev->bd_dev;
				1641	bdi = blk_get_backing_dev_info(bdev);
				1642	if (!bdi)
				1643	bdi = &default_backing_dev_info;
				1644	mapping = &inode->i_data;
				1645	mapping->a_ops = &mapping_aops;
				1646	mapping->backing_dev_info = bdi;
				1647	mapping_set_gfp_mask(mapping, GFP_NOFS);
				1648	btp->pbr_mapping = mapping;
				1649	return 0;
				1650	}
				1651
				1652	xfs_buftarg_t *
				1653	xfs_alloc_buftarg(
				1654	struct block_device *bdev,
				1655	int external)
				1656	{
				1657	xfs_buftarg_t *btp;
				1658
				1659	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
				1660
				1661	btp->pbr_dev = bdev->bd_dev;
				1662	btp->pbr_bdev = bdev;
				1663	if (xfs_setsize_buftarg_early(btp, bdev))
				1664	goto error;
				1665	if (xfs_mapping_buftarg(btp, bdev))
				1666	goto error;
				1667	xfs_alloc_bufhash(btp, external);
				1668	return btp;
				1669
				1670	error:
				1671	kmem_free(btp, sizeof(*btp));
				1672	return NULL;
				1673	}
				1674
				1675
				1676	/*
				1677	* Pagebuf delayed write buffer handling
				1678	*/
				1679
				1680	STATIC LIST_HEAD(pbd_delwrite_queue);
				1681	STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);
				1682
				1683	STATIC void
				1684	pagebuf_delwri_queue(
				1685	xfs_buf_t *pb,
				1686	int unlock)
				1687	{
				1688	PB_TRACE(pb, "delwri_q", (long)unlock);
				1689	ASSERT(pb->pb_flags & PBF_DELWRI);
				1690
				1691	spin_lock(&pbd_delwrite_lock);
				1692	/* If already in the queue, dequeue and place at tail */
				1693	if (!list_empty(&pb->pb_list)) {
				1694	if (unlock) {
				1695	atomic_dec(&pb->pb_hold);
				1696	}
				1697	list_del(&pb->pb_list);
				1698	}
				1699
				1700	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
				1701	pb->pb_queuetime = jiffies;
				1702	spin_unlock(&pbd_delwrite_lock);
				1703
				1704	if (unlock)
				1705	pagebuf_unlock(pb);
				1706	}
				1707
				1708	void
				1709	pagebuf_delwri_dequeue(
				1710	xfs_buf_t *pb)
				1711	{
				1712	int dequeued = 0;
				1713
				1714	spin_lock(&pbd_delwrite_lock);
				1715	if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
				1716	list_del_init(&pb->pb_list);
				1717	dequeued = 1;
				1718	}
				1719	pb->pb_flags &= ~PBF_DELWRI;
				1720	spin_unlock(&pbd_delwrite_lock);
				1721
				1722	if (dequeued)
				1723	pagebuf_rele(pb);
				1724
				1725	PB_TRACE(pb, "delwri_dq", (long)dequeued);
				1726	}
				1727
				1728	STATIC void
				1729	pagebuf_runall_queues(
				1730	struct workqueue_struct *queue)
				1731	{
				1732	flush_workqueue(queue);
				1733	}
				1734
				1735	/* Defines for pagebuf daemon */
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1736	STATIC DECLARE_COMPLETION(xfsbufd_done);
				1737	STATIC struct task_struct *xfsbufd_task;
				1738	STATIC int xfsbufd_active;
				1739	STATIC int xfsbufd_force_flush;
				1740	STATIC int xfsbufd_force_sleep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1741
				1742	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1743	xfsbufd_wakeup(
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1744	int priority,
				1745	unsigned int mask)
				1746	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1747	if (xfsbufd_force_sleep)
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1748	return 0;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1749	xfsbufd_force_flush = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1750	barrier();
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1751	wake_up_process(xfsbufd_task);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1752	return 0;
				1753	}
				1754
				1755	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1756	xfsbufd(
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1757	void *data)
				1758	{
				1759	struct list_head tmp;
				1760	unsigned long age;
				1761	xfs_buftarg_t *target;
				1762	xfs_buf_t pb, n;
				1763
				1764	/* Set up the thread */
				1765	daemonize("xfsbufd");
				1766	current->flags \|= PF_MEMALLOC;
				1767
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1768	xfsbufd_task = current;
				1769	xfsbufd_active = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1770	barrier();
				1771
				1772	INIT_LIST_HEAD(&tmp);
				1773	do {
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame^]	1774	if (unlikely(freezing(current))) {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1775	xfsbufd_force_sleep = 1;
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame^]	1776	refrigerator();
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1777	} else {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1778	xfsbufd_force_sleep = 0;
Nathan Scott	abd0cf7	2005-05-05 13:30:13 -0700	[diff] [blame]	1779	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1780
				1781	set_current_state(TASK_INTERRUPTIBLE);
				1782	schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);
				1783
				1784	age = (xfs_buf_age_centisecs * HZ) / 100;
				1785	spin_lock(&pbd_delwrite_lock);
				1786	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
				1787	PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
				1788	ASSERT(pb->pb_flags & PBF_DELWRI);
				1789
				1790	if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1791	if (!xfsbufd_force_flush &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1792	time_before(jiffies,
				1793	pb->pb_queuetime + age)) {
				1794	pagebuf_unlock(pb);
				1795	break;
				1796	}
				1797
				1798	pb->pb_flags &= ~PBF_DELWRI;
				1799	pb->pb_flags \|= PBF_WRITE;
				1800	list_move(&pb->pb_list, &tmp);
				1801	}
				1802	}
				1803	spin_unlock(&pbd_delwrite_lock);
				1804
				1805	while (!list_empty(&tmp)) {
				1806	pb = list_entry(tmp.next, xfs_buf_t, pb_list);
				1807	target = pb->pb_target;
				1808
				1809	list_del_init(&pb->pb_list);
				1810	pagebuf_iostrategy(pb);
				1811
				1812	blk_run_address_space(target->pbr_mapping);
				1813	}
				1814
				1815	if (as_list_len > 0)
				1816	purge_addresses();
				1817
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1818	xfsbufd_force_flush = 0;
				1819	} while (xfsbufd_active);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1820
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1821	complete_and_exit(&xfsbufd_done, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1822	}
				1823
				1824	/*
				1825	* Go through all incore buffers, and release buffers if they belong to
				1826	* the given device. This is used in filesystem error handling to
				1827	* preserve the consistency of its metadata.
				1828	*/
				1829	int
				1830	xfs_flush_buftarg(
				1831	xfs_buftarg_t *target,
				1832	int wait)
				1833	{
				1834	struct list_head tmp;
				1835	xfs_buf_t pb, n;
				1836	int pincount = 0;
				1837
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1838	pagebuf_runall_queues(xfsdatad_workqueue);
				1839	pagebuf_runall_queues(xfslogd_workqueue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1840
				1841	INIT_LIST_HEAD(&tmp);
				1842	spin_lock(&pbd_delwrite_lock);
				1843	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
				1844
				1845	if (pb->pb_target != target)
				1846	continue;
				1847
				1848	ASSERT(pb->pb_flags & PBF_DELWRI);
				1849	PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
				1850	if (pagebuf_ispin(pb)) {
				1851	pincount++;
				1852	continue;
				1853	}
				1854
				1855	pb->pb_flags &= ~PBF_DELWRI;
				1856	pb->pb_flags \|= PBF_WRITE;
				1857	list_move(&pb->pb_list, &tmp);
				1858	}
				1859	spin_unlock(&pbd_delwrite_lock);
				1860
				1861	/*
				1862	* Dropped the delayed write list lock, now walk the temporary list
				1863	*/
				1864	list_for_each_entry_safe(pb, n, &tmp, pb_list) {
				1865	if (wait)
				1866	pb->pb_flags &= ~PBF_ASYNC;
				1867	else
				1868	list_del_init(&pb->pb_list);
				1869
				1870	pagebuf_lock(pb);
				1871	pagebuf_iostrategy(pb);
				1872	}
				1873
				1874	/*
				1875	* Remaining list items must be flushed before returning
				1876	*/
				1877	while (!list_empty(&tmp)) {
				1878	pb = list_entry(tmp.next, xfs_buf_t, pb_list);
				1879
				1880	list_del_init(&pb->pb_list);
				1881	xfs_iowait(pb);
				1882	xfs_buf_relse(pb);
				1883	}
				1884
				1885	if (wait)
				1886	blk_run_address_space(target->pbr_mapping);
				1887
				1888	return pincount;
				1889	}
				1890
				1891	STATIC int
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1892	xfs_buf_daemons_start(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1893	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1894	int error = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1895
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1896	xfslogd_workqueue = create_workqueue("xfslogd");
				1897	if (!xfslogd_workqueue)
				1898	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1899
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1900	xfsdatad_workqueue = create_workqueue("xfsdatad");
				1901	if (!xfsdatad_workqueue)
				1902	goto out_destroy_xfslogd_workqueue;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1903
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1904	error = kernel_thread(xfsbufd, NULL, CLONE_FS\|CLONE_FILES);
				1905	if (error < 0)
				1906	goto out_destroy_xfsdatad_workqueue;
				1907	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1908
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1909	out_destroy_xfsdatad_workqueue:
				1910	destroy_workqueue(xfsdatad_workqueue);
				1911	out_destroy_xfslogd_workqueue:
				1912	destroy_workqueue(xfslogd_workqueue);
				1913	out:
				1914	return error;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1915	}
				1916
				1917	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1918	* Note: do not mark as __exit, it is called from pagebuf_terminate.
				1919	*/
				1920	STATIC void
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1921	xfs_buf_daemons_stop(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1922	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1923	xfsbufd_active = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1924	barrier();
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1925	wait_for_completion(&xfsbufd_done);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1926
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1927	destroy_workqueue(xfslogd_workqueue);
				1928	destroy_workqueue(xfsdatad_workqueue);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1929	}
				1930
				1931	/*
				1932	* Initialization and Termination
				1933	*/
				1934
				1935	int __init
				1936	pagebuf_init(void)
				1937	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1938	int error = -ENOMEM;
				1939
				1940	pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
				1941	if (!pagebuf_zone)
				1942	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1943
				1944	#ifdef PAGEBUF_TRACE
				1945	pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
				1946	#endif
				1947
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1948	error = xfs_buf_daemons_start();
Christoph Hellwig	cf9937c	2005-06-21 15:35:24 +1000	[diff] [blame]	1949	if (error)
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1950	goto out_free_buf_zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1951
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1952	pagebuf_shake = kmem_shake_register(xfsbufd_wakeup);
				1953	if (!pagebuf_shake) {
				1954	error = -ENOMEM;
				1955	goto out_stop_daemons;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1956	}
				1957
				1958	return 0;
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1959
				1960	out_stop_daemons:
				1961	xfs_buf_daemons_stop();
				1962	out_free_buf_zone:
				1963	#ifdef PAGEBUF_TRACE
				1964	ktrace_free(pagebuf_trace_buf);
				1965	#endif
				1966	kmem_zone_destroy(pagebuf_zone);
				1967	out:
				1968	return error;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1969	}
				1970
				1971
				1972	/*
				1973	* pagebuf_terminate.
				1974	*
				1975	* Note: do not mark as __exit, this is also called from the __init code.
				1976	*/
				1977	void
				1978	pagebuf_terminate(void)
				1979	{
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1980	xfs_buf_daemons_stop();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1981
				1982	#ifdef PAGEBUF_TRACE
				1983	ktrace_free(pagebuf_trace_buf);
				1984	#endif
				1985
Christoph Hellwig	23ea403	2005-06-21 15:14:01 +1000	[diff] [blame]	1986	kmem_zone_destroy(pagebuf_zone);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1987	kmem_shake_deregister(pagebuf_shake);
				1988	}