Blame - fs/xfs/xfs_filestream.c - kernel/msm-4.9

blob: 12b6e7701985378e56f619dfd58f79be0d45c94c [file] [log] [blame]

David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	1	/*
				2	* Copyright (c) 2006-2007 Silicon Graphics, Inc.
				3	* All Rights Reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public License as
				7	* published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it would be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				12	* GNU General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public License
				15	* along with this program; if not, write the Free Software Foundation,
				16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
				17	*/
				18	#include "xfs.h"
Dave Chinner	a4fbe6a	2013-10-23 10:51:50 +1100	[diff] [blame]	19	#include "xfs_format.h"
Dave Chinner	239880e	2013-10-23 10:50:10 +1100	[diff] [blame]	20	#include "xfs_log_format.h"
				21	#include "xfs_trans_resv.h"
				22	#include "xfs_ag.h"
				23	#include "xfs_sb.h"
				24	#include "xfs_mount.h"
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	25	#include "xfs_inum.h"
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	26	#include "xfs_inode.h"
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	27	#include "xfs_bmap.h"
Dave Chinner	6898811	2013-08-12 20:49:42 +1000	[diff] [blame]	28	#include "xfs_bmap_util.h"
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	29	#include "xfs_alloc.h"
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	30	#include "xfs_mru_cache.h"
Dave Chinner	a4fbe6a	2013-10-23 10:51:50 +1100	[diff] [blame]	31	#include "xfs_dinode.h"
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	32	#include "xfs_filestream.h"
Christoph Hellwig	0b1b213	2009-12-14 23:14:59 +0000	[diff] [blame]	33	#include "xfs_trace.h"
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	34
				35	#ifdef XFS_FILESTREAMS_TRACE
				36
				37	ktrace_t *xfs_filestreams_trace_buf;
				38
				39	STATIC void
				40	xfs_filestreams_trace(
				41	xfs_mount_t mp, / mount point */
				42	int type, /* type of trace */
				43	const char func, / source function */
				44	int line, /* source line number */
				45	__psunsigned_t arg0,
				46	__psunsigned_t arg1,
				47	__psunsigned_t arg2,
				48	__psunsigned_t arg3,
				49	__psunsigned_t arg4,
				50	__psunsigned_t arg5)
				51	{
				52	ktrace_enter(xfs_filestreams_trace_buf,
				53	(void *)(__psint_t)(type \| (line << 16)),
				54	(void *)func,
				55	(void *)(__psunsigned_t)current_pid(),
				56	(void *)mp,
				57	(void *)(__psunsigned_t)arg0,
				58	(void *)(__psunsigned_t)arg1,
				59	(void *)(__psunsigned_t)arg2,
				60	(void *)(__psunsigned_t)arg3,
				61	(void *)(__psunsigned_t)arg4,
				62	(void *)(__psunsigned_t)arg5,
				63	NULL, NULL, NULL, NULL, NULL, NULL);
				64	}
				65
				66	#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0)
				67	#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0)
				68	#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0)
				69	#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0)
				70	#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
				71	#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
				72	#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
Harvey Harrison	34a622b	2008-04-10 12:19:21 +1000	[diff] [blame]	73	xfs_filestreams_trace(mp, t, __func__, __LINE__, \
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	74	(__psunsigned_t)a0, (__psunsigned_t)a1, \
				75	(__psunsigned_t)a2, (__psunsigned_t)a3, \
				76	(__psunsigned_t)a4, (__psunsigned_t)a5)
				77
				78	#define TRACE_AG_SCAN(mp, ag, ag2) \
				79	TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
				80	#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
				81	TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
				82	#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
				83	TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
				84	cnt, free, scan, flag)
				85	#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
				86	TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
				87	#define TRACE_FREE(mp, ip, pip, ag, cnt) \
				88	TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
				89	#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
				90	TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
				91	#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
				92	TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
				93	#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
				94	TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
				95	#define TRACE_ORPHAN(mp, ip, ag) \
				96	TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
				97
				98
				99	#else
				100	#define TRACE_AG_SCAN(mp, ag, ag2)
				101	#define TRACE_AG_PICK1(mp, max_ag, maxfree)
				102	#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
				103	#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
				104	#define TRACE_FREE(mp, ip, pip, ag, cnt)
				105	#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
				106	#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
				107	#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
				108	#define TRACE_ORPHAN(mp, ip, ag)
				109	#endif
				110
				111	static kmem_zone_t *item_zone;
				112
				113	/*
				114	* Structure for associating a file or a directory with an allocation group.
				115	* The parent directory pointer is only needed for files, but since there will
				116	* generally be vastly more files than directories in the cache, using the same
				117	* data structure simplifies the code with very little memory overhead.
				118	*/
				119	typedef struct fstrm_item
				120	{
				121	xfs_agnumber_t ag; /* AG currently in use for the file/directory. */
				122	xfs_inode_t ip; / inode self-pointer. */
				123	xfs_inode_t pip; / Parent directory inode pointer. */
				124	} fstrm_item_t;
				125
Christoph Hellwig	0664ce8	2010-07-20 17:31:01 +1000	[diff] [blame]	126	/*
				127	* Allocation group filestream associations are tracked with per-ag atomic
				128	* counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
				129	* particular AG already has active filestreams associated with it. The mount
				130	* point's m_peraglock is used to protect these counters from per-ag array
				131	* re-allocation during a growfs operation. When xfs_growfs_data_private() is
				132	* about to reallocate the array, it calls xfs_filestream_flush() with the
				133	* m_peraglock held in write mode.
				134	*
				135	* Since xfs_mru_cache_flush() guarantees that all the free functions for all
				136	* the cache elements have finished executing before it returns, it's safe for
				137	* the free functions to use the atomic counters without m_peraglock protection.
				138	* This allows the implementation of xfs_fstrm_free_func() to be agnostic about
				139	* whether it was called with the m_peraglock held in read mode, write mode or
				140	* not held at all. The race condition this addresses is the following:
				141	*
				142	* - The work queue scheduler fires and pulls a filestream directory cache
				143	* element off the LRU end of the cache for deletion, then gets pre-empted.
				144	* - A growfs operation grabs the m_peraglock in write mode, flushes all the
				145	* remaining items from the cache and reallocates the mount point's per-ag
				146	* array, resetting all the counters to zero.
				147	* - The work queue thread resumes and calls the free function for the element
				148	* it started cleaning up earlier. In the process it decrements the
				149	* filestreams counter for an AG that now has no references.
				150	*
				151	* With a shrinkfs feature, the above scenario could panic the system.
				152	*
				153	* All other uses of the following macros should be protected by either the
				154	* m_peraglock held in read mode, or the cache's internal locking exposed by the
				155	* interval between a call to xfs_mru_cache_lookup() and a call to
				156	* xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
				157	* when new elements are added to the cache.
				158	*
				159	* Combined, these locking rules ensure that no associations will ever exist in
				160	* the cache that reference per-ag array elements that have since been
				161	* reallocated.
				162	*/
				163	static int
				164	xfs_filestream_peek_ag(
				165	xfs_mount_t *mp,
				166	xfs_agnumber_t agno)
				167	{
				168	struct xfs_perag *pag;
				169	int ret;
				170
				171	pag = xfs_perag_get(mp, agno);
				172	ret = atomic_read(&pag->pagf_fstrms);
				173	xfs_perag_put(pag);
				174	return ret;
				175	}
				176
				177	static int
				178	xfs_filestream_get_ag(
				179	xfs_mount_t *mp,
				180	xfs_agnumber_t agno)
				181	{
				182	struct xfs_perag *pag;
				183	int ret;
				184
				185	pag = xfs_perag_get(mp, agno);
				186	ret = atomic_inc_return(&pag->pagf_fstrms);
				187	xfs_perag_put(pag);
				188	return ret;
				189	}
				190
				191	static void
				192	xfs_filestream_put_ag(
				193	xfs_mount_t *mp,
				194	xfs_agnumber_t agno)
				195	{
				196	struct xfs_perag *pag;
				197
				198	pag = xfs_perag_get(mp, agno);
				199	atomic_dec(&pag->pagf_fstrms);
				200	xfs_perag_put(pag);
				201	}
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	202
				203	/*
				204	* Scan the AGs starting at startag looking for an AG that isn't in use and has
				205	* at least minlen blocks free.
				206	*/
				207	static int
				208	_xfs_filestream_pick_ag(
				209	xfs_mount_t *mp,
				210	xfs_agnumber_t startag,
				211	xfs_agnumber_t *agp,
				212	int flags,
				213	xfs_extlen_t minlen)
				214	{
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	215	int streams, max_streams;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	216	int err, trylock, nscan;
Dave Chinner	6cc8764	2009-03-16 08:29:46 +0100	[diff] [blame]	217	xfs_extlen_t longest, free, minfree, maxfree = 0;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	218	xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
				219	struct xfs_perag *pag;
				220
				221	/* 2% of an AG's blocks must be free for it to be chosen. */
				222	minfree = mp->m_sb.sb_agblocks / 50;
				223
				224	ag = startag;
				225	*agp = NULLAGNUMBER;
				226
				227	/* For the first pass, don't sleep trying to init the per-AG. */
				228	trylock = XFS_ALLOC_FLAG_TRYLOCK;
				229
				230	for (nscan = 0; 1; nscan++) {
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	231	pag = xfs_perag_get(mp, ag);
				232	TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	233
				234	if (!pag->pagf_init) {
				235	err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	236	if (err && !trylock) {
				237	xfs_perag_put(pag);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	238	return err;
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	239	}
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	240	}
				241
				242	/* Might fail sometimes during the 1st pass with trylock set. */
				243	if (!pag->pagf_init)
				244	goto next_ag;
				245
				246	/* Keep track of the AG with the most free blocks. */
				247	if (pag->pagf_freeblks > maxfree) {
				248	maxfree = pag->pagf_freeblks;
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	249	max_streams = atomic_read(&pag->pagf_fstrms);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	250	max_ag = ag;
				251	}
				252
				253	/*
				254	* The AG reference count does two things: it enforces mutual
				255	* exclusion when examining the suitability of an AG in this
				256	* loop, and it guards against two filestreams being established
				257	* in the same AG as each other.
				258	*/
				259	if (xfs_filestream_get_ag(mp, ag) > 1) {
				260	xfs_filestream_put_ag(mp, ag);
				261	goto next_ag;
				262	}
				263
Dave Chinner	6cc8764	2009-03-16 08:29:46 +0100	[diff] [blame]	264	longest = xfs_alloc_longest_free_extent(mp, pag);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	265	if (((minlen && longest >= minlen) \|\|
				266	(!minlen && pag->pagf_freeblks >= minfree)) &&
				267	(!pag->pagf_metadata \|\| !(flags & XFS_PICK_USERDATA) \|\|
				268	(flags & XFS_PICK_LOWSPACE))) {
				269
				270	/* Break out, retaining the reference on the AG. */
				271	free = pag->pagf_freeblks;
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	272	streams = atomic_read(&pag->pagf_fstrms);
				273	xfs_perag_put(pag);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	274	*agp = ag;
				275	break;
				276	}
				277
				278	/* Drop the reference on this AG, it's not usable. */
				279	xfs_filestream_put_ag(mp, ag);
				280	next_ag:
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	281	xfs_perag_put(pag);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	282	/* Move to the next AG, wrapping to AG 0 if necessary. */
				283	if (++ag >= mp->m_sb.sb_agcount)
				284	ag = 0;
				285
				286	/* If a full pass of the AGs hasn't been done yet, continue. */
				287	if (ag != startag)
				288	continue;
				289
				290	/* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
				291	if (trylock != 0) {
				292	trylock = 0;
				293	continue;
				294	}
				295
				296	/* Finally, if lowspace wasn't set, set it for the 3rd pass. */
				297	if (!(flags & XFS_PICK_LOWSPACE)) {
				298	flags \|= XFS_PICK_LOWSPACE;
				299	continue;
				300	}
				301
				302	/*
				303	* Take the AG with the most free space, regardless of whether
				304	* it's already in use by another filestream.
				305	*/
				306	if (max_ag != NULLAGNUMBER) {
				307	xfs_filestream_get_ag(mp, max_ag);
				308	TRACE_AG_PICK1(mp, max_ag, maxfree);
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	309	streams = max_streams;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	310	free = maxfree;
				311	*agp = max_ag;
				312	break;
				313	}
				314
				315	/* take AG 0 if none matched */
				316	TRACE_AG_PICK1(mp, max_ag, maxfree);
				317	*agp = 0;
				318	return 0;
				319	}
				320
Dave Chinner	4196ac0	2010-01-11 11:47:42 +0000	[diff] [blame]	321	TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	322
				323	return 0;
				324	}
				325
				326	/*
				327	* Set the allocation group number for a file or a directory, updating inode
Dave Chinner	1c1c6eb	2010-01-11 11:47:44 +0000	[diff] [blame]	328	* references and per-AG references as appropriate.
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	329	*/
				330	static int
				331	_xfs_filestream_update_ag(
				332	xfs_inode_t *ip,
				333	xfs_inode_t *pip,
				334	xfs_agnumber_t ag)
				335	{
				336	int err = 0;
				337	xfs_mount_t *mp;
				338	xfs_mru_cache_t *cache;
				339	fstrm_item_t *item;
				340	xfs_agnumber_t old_ag;
				341	xfs_inode_t *old_pip;
				342
				343	/*
				344	* Either ip is a regular file and pip is a directory, or ip is a
				345	* directory and pip is NULL.
				346	*/
Al Viro	abbede1	2011-07-26 02:31:30 -0400	[diff] [blame]	347	ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip &&
Al Viro	0320937	2011-07-25 20:54:24 -0400	[diff] [blame]	348	S_ISDIR(pip->i_d.di_mode)) \|\|
				349	(S_ISDIR(ip->i_d.di_mode) && !pip)));
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	350
				351	mp = ip->i_mount;
				352	cache = mp->m_filestream;
				353
				354	item = xfs_mru_cache_lookup(cache, ip->i_ino);
				355	if (item) {
				356	ASSERT(item->ip == ip);
				357	old_ag = item->ag;
				358	item->ag = ag;
				359	old_pip = item->pip;
				360	item->pip = pip;
				361	xfs_mru_cache_done(cache);
				362
				363	/*
				364	* If the AG has changed, drop the old ref and take a new one,
				365	* effectively transferring the reference from old to new AG.
				366	*/
				367	if (ag != old_ag) {
				368	xfs_filestream_put_ag(mp, old_ag);
				369	xfs_filestream_get_ag(mp, ag);
				370	}
				371
				372	/*
				373	* If ip is a file and its pip has changed, drop the old ref and
				374	* take a new one.
				375	*/
				376	if (pip && pip != old_pip) {
				377	IRELE(old_pip);
				378	IHOLD(pip);
				379	}
				380
				381	TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
				382	ag, xfs_filestream_peek_ag(mp, ag));
				383	return 0;
				384	}
				385
				386	item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
				387	if (!item)
				388	return ENOMEM;
				389
				390	item->ag = ag;
				391	item->ip = ip;
				392	item->pip = pip;
				393
				394	err = xfs_mru_cache_insert(cache, ip->i_ino, item);
				395	if (err) {
				396	kmem_zone_free(item_zone, item);
				397	return err;
				398	}
				399
				400	/* Take a reference on the AG. */
				401	xfs_filestream_get_ag(mp, ag);
				402
				403	/*
				404	* Take a reference on the inode itself regardless of whether it's a
				405	* regular file or a directory.
				406	*/
				407	IHOLD(ip);
				408
				409	/*
				410	* In the case of a regular file, take a reference on the parent inode
				411	* as well to ensure it remains in-core.
				412	*/
				413	if (pip)
				414	IHOLD(pip);
				415
				416	TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
				417	ag, xfs_filestream_peek_ag(mp, ag));
				418
				419	return 0;
				420	}
				421
				422	/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
David Chinner	a8272ce	2007-11-23 16:28:09 +1100	[diff] [blame]	423	STATIC void
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	424	xfs_fstrm_free_func(
Eric Sandeen	bcc7b44	2007-08-30 17:21:38 +1000	[diff] [blame]	425	unsigned long ino,
				426	void *data)
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	427	{
Eric Sandeen	bcc7b44	2007-08-30 17:21:38 +1000	[diff] [blame]	428	fstrm_item_t item = (fstrm_item_t )data;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	429	xfs_inode_t *ip = item->ip;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	430
				431	ASSERT(ip->i_ino == ino);
				432
				433	xfs_iflags_clear(ip, XFS_IFILESTREAM);
				434
				435	/* Drop the reference taken on the AG when the item was added. */
Christoph Hellwig	0664ce8	2010-07-20 17:31:01 +1000	[diff] [blame]	436	xfs_filestream_put_ag(ip->i_mount, item->ag);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	437
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	438	TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
				439	xfs_filestream_peek_ag(ip->i_mount, item->ag));
				440
				441	/*
				442	* _xfs_filestream_update_ag() always takes a reference on the inode
				443	* itself, whether it's a file or a directory. Release it here.
				444	* This can result in the inode being freed and so we must
				445	* not hold any inode locks when freeing filesstreams objects
				446	* otherwise we can deadlock here.
				447	*/
				448	IRELE(ip);
				449
				450	/*
				451	* In the case of a regular file, _xfs_filestream_update_ag() also
				452	* takes a ref on the parent inode to keep it in-core. Release that
				453	* too.
				454	*/
				455	if (item->pip)
				456	IRELE(item->pip);
				457
				458	/* Finally, free the memory allocated for the item. */
				459	kmem_zone_free(item_zone, item);
				460	}
				461
				462	/*
				463	* xfs_filestream_init() is called at xfs initialisation time to set up the
				464	* memory zone that will be used for filestream data structure allocation.
				465	*/
				466	int
				467	xfs_filestream_init(void)
				468	{
				469	item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
Christoph Hellwig	9f8868f	2008-07-18 17:11:46 +1000	[diff] [blame]	470	if (!item_zone)
				471	return -ENOMEM;
Christoph Hellwig	0b1b213	2009-12-14 23:14:59 +0000	[diff] [blame]	472
Christoph Hellwig	9f8868f	2008-07-18 17:11:46 +1000	[diff] [blame]	473	return 0;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	474	}
				475
				476	/*
				477	* xfs_filestream_uninit() is called at xfs termination time to destroy the
				478	* memory zone that was used for filestream data structure allocation.
				479	*/
				480	void
				481	xfs_filestream_uninit(void)
				482	{
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	483	kmem_zone_destroy(item_zone);
				484	}
				485
				486	/*
				487	* xfs_filestream_mount() is called when a file system is mounted with the
				488	* filestream option. It is responsible for allocating the data structures
				489	* needed to track the new file system's file streams.
				490	*/
				491	int
				492	xfs_filestream_mount(
				493	xfs_mount_t *mp)
				494	{
				495	int err;
				496	unsigned int lifetime, grp_count;
				497
				498	/*
				499	* The filestream timer tunable is currently fixed within the range of
				500	* one second to four minutes, with five seconds being the default. The
				501	* group count is somewhat arbitrary, but it'd be nice to adhere to the
				502	* timer tunable to within about 10 percent. This requires at least 10
				503	* groups.
				504	*/
				505	lifetime = xfs_fstrm_centisecs * 10;
				506	grp_count = 10;
				507
				508	err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
Eric Sandeen	bcc7b44	2007-08-30 17:21:38 +1000	[diff] [blame]	509	xfs_fstrm_free_func);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	510
				511	return err;
				512	}
				513
				514	/*
				515	* xfs_filestream_unmount() is called when a file system that was mounted with
				516	* the filestream option is unmounted. It drains the data structures created
				517	* to track the file system's file streams and frees all the memory that was
				518	* allocated.
				519	*/
				520	void
				521	xfs_filestream_unmount(
				522	xfs_mount_t *mp)
				523	{
				524	xfs_mru_cache_destroy(mp->m_filestream);
				525	}
				526
				527	/*
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	528	* Return the AG of the filestream the file or directory belongs to, or
				529	* NULLAGNUMBER otherwise.
				530	*/
				531	xfs_agnumber_t
				532	xfs_filestream_lookup_ag(
				533	xfs_inode_t *ip)
				534	{
				535	xfs_mru_cache_t *cache;
				536	fstrm_item_t *item;
				537	xfs_agnumber_t ag;
				538	int ref;
				539
Al Viro	0320937	2011-07-25 20:54:24 -0400	[diff] [blame]	540	if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) {
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	541	ASSERT(0);
				542	return NULLAGNUMBER;
				543	}
				544
				545	cache = ip->i_mount->m_filestream;
				546	item = xfs_mru_cache_lookup(cache, ip->i_ino);
				547	if (!item) {
				548	TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
				549	return NULLAGNUMBER;
				550	}
				551
				552	ASSERT(ip == item->ip);
				553	ag = item->ag;
				554	ref = xfs_filestream_peek_ag(ip->i_mount, ag);
				555	xfs_mru_cache_done(cache);
				556
				557	TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
				558	return ag;
				559	}
				560
				561	/*
				562	* xfs_filestream_associate() should only be called to associate a regular file
				563	* with its parent directory. Calling it with a child directory isn't
				564	* appropriate because filestreams don't apply to entire directory hierarchies.
				565	* Creating a file in a child directory of an existing filestream directory
				566	* starts a new filestream with its own allocation group association.
				567	*
				568	* Returns < 0 on error, 0 if successful association occurred, > 0 if
				569	* we failed to get an association because of locking issues.
				570	*/
				571	int
				572	xfs_filestream_associate(
				573	xfs_inode_t *pip,
				574	xfs_inode_t *ip)
				575	{
				576	xfs_mount_t *mp;
				577	xfs_mru_cache_t *cache;
				578	fstrm_item_t *item;
				579	xfs_agnumber_t ag, rotorstep, startag;
				580	int err = 0;
				581
Al Viro	0320937	2011-07-25 20:54:24 -0400	[diff] [blame]	582	ASSERT(S_ISDIR(pip->i_d.di_mode));
				583	ASSERT(S_ISREG(ip->i_d.di_mode));
				584	if (!S_ISDIR(pip->i_d.di_mode) \|\| !S_ISREG(ip->i_d.di_mode))
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	585	return -EINVAL;
				586
				587	mp = pip->i_mount;
				588	cache = mp->m_filestream;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	589
				590	/*
				591	* We have a problem, Houston.
				592	*
				593	* Taking the iolock here violates inode locking order - we already
				594	* hold the ilock. Hence if we block getting this lock we may never
				595	* wake. Unfortunately, that means if we can't get the lock, we're
				596	* screwed in terms of getting a stream association - we can't spin
				597	* waiting for the lock because someone else is waiting on the lock we
				598	* hold and we cannot drop that as we are in a transaction here.
				599	*
Christoph Hellwig	075fe10	2009-06-08 15:35:48 +0200	[diff] [blame]	600	* Lucky for us, this inversion is not a problem because it's a
				601	* directory inode that we are trying to lock here.
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	602	*
				603	* So, if we can't get the iolock without sleeping then just give up
				604	*/
Dave Chinner	1c1c6eb	2010-01-11 11:47:44 +0000	[diff] [blame]	605	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	606	return 1;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	607
				608	/* If the parent directory is already in the cache, use its AG. */
				609	item = xfs_mru_cache_lookup(cache, pip->i_ino);
				610	if (item) {
				611	ASSERT(item->ip == pip);
				612	ag = item->ag;
				613	xfs_mru_cache_done(cache);
				614
				615	TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
				616	err = _xfs_filestream_update_ag(ip, pip, ag);
				617
				618	goto exit;
				619	}
				620
				621	/*
				622	* Set the starting AG using the rotor for inode32, otherwise
				623	* use the directory inode's AG.
				624	*/
				625	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
				626	rotorstep = xfs_rotorstep;
				627	startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
				628	mp->m_agfrotor = (mp->m_agfrotor + 1) %
				629	(mp->m_sb.sb_agcount * rotorstep);
				630	} else
				631	startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
				632
				633	/* Pick a new AG for the parent inode starting at startag. */
				634	err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
				635	if (err \|\| ag == NULLAGNUMBER)
				636	goto exit_did_pick;
				637
				638	/* Associate the parent inode with the AG. */
				639	err = _xfs_filestream_update_ag(pip, NULL, ag);
				640	if (err)
				641	goto exit_did_pick;
				642
				643	/* Associate the file inode with the AG. */
				644	err = _xfs_filestream_update_ag(ip, pip, ag);
				645	if (err)
				646	goto exit_did_pick;
				647
				648	TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
				649
				650	exit_did_pick:
				651	/*
				652	* If _xfs_filestream_pick_ag() returned a valid AG, remove the
				653	* reference it took on it, since the file and directory will have taken
				654	* their own now if they were successfully cached.
				655	*/
				656	if (ag != NULLAGNUMBER)
				657	xfs_filestream_put_ag(mp, ag);
				658
				659	exit:
				660	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	661	return -err;
				662	}
				663
				664	/*
				665	* Pick a new allocation group for the current file and its file stream. This
				666	* function is called by xfs_bmap_filestreams() with the mount point's per-ag
				667	* lock held.
				668	*/
				669	int
				670	xfs_filestream_new_ag(
Dave Chinner	6898811	2013-08-12 20:49:42 +1000	[diff] [blame]	671	struct xfs_bmalloca *ap,
				672	xfs_agnumber_t *agp)
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	673	{
				674	int flags, err;
				675	xfs_inode_t ip, pip = NULL;
				676	xfs_mount_t *mp;
				677	xfs_mru_cache_t *cache;
				678	xfs_extlen_t minlen;
				679	fstrm_item_t dir, file;
				680	xfs_agnumber_t ag = NULLAGNUMBER;
				681
				682	ip = ap->ip;
				683	mp = ip->i_mount;
				684	cache = mp->m_filestream;
Dave Chinner	3a75667	2011-09-18 20:40:58 +0000	[diff] [blame]	685	minlen = ap->length;
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	686	*agp = NULLAGNUMBER;
				687
				688	/*
				689	* Look for the file in the cache, removing it if it's found. Doing
				690	* this allows it to be held across the dir lookup that follows.
				691	*/
				692	file = xfs_mru_cache_remove(cache, ip->i_ino);
				693	if (file) {
				694	ASSERT(ip == file->ip);
				695
				696	/* Save the file's parent inode and old AG number for later. */
				697	pip = file->pip;
				698	ag = file->ag;
				699
				700	/* Look for the file's directory in the cache. */
				701	dir = xfs_mru_cache_lookup(cache, pip->i_ino);
				702	if (dir) {
				703	ASSERT(pip == dir->ip);
				704
				705	/*
				706	* If the directory has already moved on to a new AG,
				707	* use that AG as the new AG for the file. Don't
				708	* forget to twiddle the AG refcounts to match the
				709	* movement.
				710	*/
				711	if (dir->ag != file->ag) {
				712	xfs_filestream_put_ag(mp, file->ag);
				713	xfs_filestream_get_ag(mp, dir->ag);
				714	*agp = file->ag = dir->ag;
				715	}
				716
				717	xfs_mru_cache_done(cache);
				718	}
				719
				720	/*
				721	* Put the file back in the cache. If this fails, the free
				722	* function needs to be called to tidy up in the same way as if
				723	* the item had simply expired from the cache.
				724	*/
				725	err = xfs_mru_cache_insert(cache, ip->i_ino, file);
				726	if (err) {
				727	xfs_fstrm_free_func(ip->i_ino, file);
				728	return err;
				729	}
				730
				731	/*
				732	* If the file's AG was moved to the directory's new AG, there's
				733	* nothing more to be done.
				734	*/
				735	if (*agp != NULLAGNUMBER) {
				736	TRACE_MOVEAG(mp, ip, pip,
				737	ag, xfs_filestream_peek_ag(mp, ag),
				738	agp, xfs_filestream_peek_ag(mp, agp));
				739	return 0;
				740	}
				741	}
				742
				743	/*
				744	* If the file's parent directory is known, take its iolock in exclusive
				745	* mode to prevent two sibling files from racing each other to migrate
				746	* themselves and their parent to different AGs.
Christoph Hellwig	785ce41	2010-11-06 11:42:44 +0000	[diff] [blame]	747	*
				748	* Note that we lock the parent directory iolock inside the child
				749	* iolock here. That's fine as we never hold both parent and child
				750	* iolock in any other place. This is different from the ilock,
				751	* which requires locking of the child after the parent for namespace
				752	* operations.
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	753	*/
				754	if (pip)
Christoph Hellwig	785ce41	2010-11-06 11:42:44 +0000	[diff] [blame]	755	xfs_ilock(pip, XFS_IOLOCK_EXCL \| XFS_IOLOCK_PARENT);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	756
				757	/*
				758	* A new AG needs to be found for the file. If the file's parent
				759	* directory is also known, it will be moved to the new AG as well to
				760	* ensure that files created inside it in future use the new AG.
				761	*/
				762	ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
				763	flags = (ap->userdata ? XFS_PICK_USERDATA : 0) \|
Dave Chinner	0937e0f	2011-09-18 20:40:57 +0000	[diff] [blame]	764	(ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
David Chinner	2a82b8b	2007-07-11 11:09:12 +1000	[diff] [blame]	765
				766	err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
				767	if (err \|\| *agp == NULLAGNUMBER)
				768	goto exit;
				769
				770	/*
				771	* If the file wasn't found in the file cache, then its parent directory
				772	* inode isn't known. For this to have happened, the file must either
				773	* be pre-existing, or it was created long enough ago that its cache
				774	* entry has expired. This isn't the sort of usage that the filestreams
				775	* allocator is trying to optimise, so there's no point trying to track
				776	* its new AG somehow in the filestream data structures.
				777	*/
				778	if (!pip) {
				779	TRACE_ORPHAN(mp, ip, *agp);
				780	goto exit;
				781	}
				782
				783	/* Associate the parent inode with the AG. */
				784	err = _xfs_filestream_update_ag(pip, NULL, *agp);
				785	if (err)
				786	goto exit;
				787
				788	/* Associate the file inode with the AG. */
				789	err = _xfs_filestream_update_ag(ip, pip, *agp);
				790	if (err)
				791	goto exit;
				792
				793	TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
				794	agp, xfs_filestream_peek_ag(mp, agp));
				795
				796	exit:
				797	/*
				798	* If _xfs_filestream_pick_ag() returned a valid AG, remove the
				799	* reference it took on it, since the file and directory will have taken
				800	* their own now if they were successfully cached.
				801	*/
				802	if (*agp != NULLAGNUMBER)
				803	xfs_filestream_put_ag(mp, *agp);
				804	else
				805	*agp = 0;
				806
				807	if (pip)
				808	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
				809
				810	return err;
				811	}
				812
				813	/*
				814	* Remove an association between an inode and a filestream object.
				815	* Typically this is done on last close of an unlinked file.
				816	*/
				817	void
				818	xfs_filestream_deassociate(
				819	xfs_inode_t *ip)
				820	{
				821	xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
				822
				823	xfs_mru_cache_delete(cache, ip->i_ino);
				824	}