Blame - fs/ceph/caps.c - kernel/msm-5.4

blob: 40b8d347124440bc4ea52d3d78d3293cabb909a2 [file] [log] [blame]

Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1	#include "ceph_debug.h"
				2
				3	#include <linux/fs.h>
				4	#include <linux/kernel.h>
				5	#include <linux/sched.h>
				6	#include <linux/vmalloc.h>
				7	#include <linux/wait.h>
				8
				9	#include "super.h"
				10	#include "decode.h"
				11	#include "messenger.h"
				12
				13	/*
				14	* Capability management
				15	*
				16	* The Ceph metadata servers control client access to inode metadata
				17	* and file data by issuing capabilities, granting clients permission
				18	* to read and/or write both inode field and file data to OSDs
				19	* (storage nodes). Each capability consists of a set of bits
				20	* indicating which operations are allowed.
				21	*
				22	* If the client holds a *_SHARED cap, the client has a coherent value
				23	* that can be safely read from the cached inode.
				24	*
				25	* In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
				26	* client is allowed to change inode attributes (e.g., file size,
				27	* mtime), note its dirty state in the ceph_cap, and asynchronously
				28	* flush that metadata change to the MDS.
				29	*
				30	* In the event of a conflicting operation (perhaps by another
				31	* client), the MDS will revoke the conflicting client capabilities.
				32	*
				33	* In order for a client to cache an inode, it must hold a capability
				34	* with at least one MDS server. When inodes are released, release
				35	* notifications are batched and periodically sent en masse to the MDS
				36	* cluster to release server state.
				37	*/
				38
				39
				40	/*
				41	* Generate readable cap strings for debugging output.
				42	*/
				43	#define MAX_CAP_STR 20
				44	static char cap_str[MAX_CAP_STR][40];
				45	static DEFINE_SPINLOCK(cap_str_lock);
				46	static int last_cap_str;
				47
				48	static char gcap_string(char s, int c)
				49	{
				50	if (c & CEPH_CAP_GSHARED)
				51	*s++ = 's';
				52	if (c & CEPH_CAP_GEXCL)
				53	*s++ = 'x';
				54	if (c & CEPH_CAP_GCACHE)
				55	*s++ = 'c';
				56	if (c & CEPH_CAP_GRD)
				57	*s++ = 'r';
				58	if (c & CEPH_CAP_GWR)
				59	*s++ = 'w';
				60	if (c & CEPH_CAP_GBUFFER)
				61	*s++ = 'b';
				62	if (c & CEPH_CAP_GLAZYIO)
				63	*s++ = 'l';
				64	return s;
				65	}
				66
				67	const char *ceph_cap_string(int caps)
				68	{
				69	int i;
				70	char *s;
				71	int c;
				72
				73	spin_lock(&cap_str_lock);
				74	i = last_cap_str++;
				75	if (last_cap_str == MAX_CAP_STR)
				76	last_cap_str = 0;
				77	spin_unlock(&cap_str_lock);
				78
				79	s = cap_str[i];
				80
				81	if (caps & CEPH_CAP_PIN)
				82	*s++ = 'p';
				83
				84	c = (caps >> CEPH_CAP_SAUTH) & 3;
				85	if (c) {
				86	*s++ = 'A';
				87	s = gcap_string(s, c);
				88	}
				89
				90	c = (caps >> CEPH_CAP_SLINK) & 3;
				91	if (c) {
				92	*s++ = 'L';
				93	s = gcap_string(s, c);
				94	}
				95
				96	c = (caps >> CEPH_CAP_SXATTR) & 3;
				97	if (c) {
				98	*s++ = 'X';
				99	s = gcap_string(s, c);
				100	}
				101
				102	c = caps >> CEPH_CAP_SFILE;
				103	if (c) {
				104	*s++ = 'F';
				105	s = gcap_string(s, c);
				106	}
				107
				108	if (s == cap_str[i])
				109	*s++ = '-';
				110	*s = 0;
				111	return cap_str[i];
				112	}
				113
				114	/*
				115	* Cap reservations
				116	*
				117	* Maintain a global pool of preallocated struct ceph_caps, referenced
				118	* by struct ceph_caps_reservations. This ensures that we preallocate
				119	* memory needed to successfully process an MDS response. (If an MDS
				120	* sends us cap information and we fail to process it, we will have
				121	* problems due to the client and MDS being out of sync.)
				122	*
				123	* Reservations are 'owned' by a ceph_cap_reservation context.
				124	*/
				125	static spinlock_t caps_list_lock;
				126	static struct list_head caps_list; /* unused (reserved or unreserved) */
				127	static int caps_total_count; /* total caps allocated */
				128	static int caps_use_count; /* in use */
				129	static int caps_reserve_count; /* unused, reserved */
				130	static int caps_avail_count; /* unused, unreserved */
				131
				132	void __init ceph_caps_init(void)
				133	{
				134	INIT_LIST_HEAD(&caps_list);
				135	spin_lock_init(&caps_list_lock);
				136	}
				137
				138	void ceph_caps_finalize(void)
				139	{
				140	struct ceph_cap *cap;
				141
				142	spin_lock(&caps_list_lock);
				143	while (!list_empty(&caps_list)) {
				144	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
				145	list_del(&cap->caps_item);
				146	kmem_cache_free(ceph_cap_cachep, cap);
				147	}
				148	caps_total_count = 0;
				149	caps_avail_count = 0;
				150	caps_use_count = 0;
				151	caps_reserve_count = 0;
				152	spin_unlock(&caps_list_lock);
				153	}
				154
				155	int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
				156	{
				157	int i;
				158	struct ceph_cap *cap;
				159	int have;
				160	int alloc = 0;
				161	LIST_HEAD(newcaps);
				162	int ret = 0;
				163
				164	dout("reserve caps ctx=%p need=%d\n", ctx, need);
				165
				166	/* first reserve any caps that are already allocated */
				167	spin_lock(&caps_list_lock);
				168	if (caps_avail_count >= need)
				169	have = need;
				170	else
				171	have = caps_avail_count;
				172	caps_avail_count -= have;
				173	caps_reserve_count += have;
				174	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				175	caps_avail_count);
				176	spin_unlock(&caps_list_lock);
				177
				178	for (i = have; i < need; i++) {
				179	cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				180	if (!cap) {
				181	ret = -ENOMEM;
				182	goto out_alloc_count;
				183	}
				184	list_add(&cap->caps_item, &newcaps);
				185	alloc++;
				186	}
				187	BUG_ON(have + alloc != need);
				188
				189	spin_lock(&caps_list_lock);
				190	caps_total_count += alloc;
				191	caps_reserve_count += alloc;
				192	list_splice(&newcaps, &caps_list);
				193
				194	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				195	caps_avail_count);
				196	spin_unlock(&caps_list_lock);
				197
				198	ctx->count = need;
				199	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
				200	ctx, caps_total_count, caps_use_count, caps_reserve_count,
				201	caps_avail_count);
				202	return 0;
				203
				204	out_alloc_count:
				205	/* we didn't manage to reserve as much as we needed */
				206	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
				207	ctx, need, have);
				208	return ret;
				209	}
				210
				211	int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
				212	{
				213	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
				214	if (ctx->count) {
				215	spin_lock(&caps_list_lock);
				216	BUG_ON(caps_reserve_count < ctx->count);
				217	caps_reserve_count -= ctx->count;
				218	caps_avail_count += ctx->count;
				219	ctx->count = 0;
				220	dout("unreserve caps %d = %d used + %d resv + %d avail\n",
				221	caps_total_count, caps_use_count, caps_reserve_count,
				222	caps_avail_count);
				223	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				224	caps_avail_count);
				225	spin_unlock(&caps_list_lock);
				226	}
				227	return 0;
				228	}
				229
				230	static struct ceph_cap get_cap(struct ceph_cap_reservation ctx)
				231	{
				232	struct ceph_cap *cap = NULL;
				233
				234	/* temporary, until we do something about cap import/export */
				235	if (!ctx)
				236	return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				237
				238	spin_lock(&caps_list_lock);
				239	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
				240	ctx, ctx->count, caps_total_count, caps_use_count,
				241	caps_reserve_count, caps_avail_count);
				242	BUG_ON(!ctx->count);
				243	BUG_ON(ctx->count > caps_reserve_count);
				244	BUG_ON(list_empty(&caps_list));
				245
				246	ctx->count--;
				247	caps_reserve_count--;
				248	caps_use_count++;
				249
				250	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
				251	list_del(&cap->caps_item);
				252
				253	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				254	caps_avail_count);
				255	spin_unlock(&caps_list_lock);
				256	return cap;
				257	}
				258
				259	static void put_cap(struct ceph_cap *cap,
				260	struct ceph_cap_reservation *ctx)
				261	{
				262	spin_lock(&caps_list_lock);
				263	dout("put_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
				264	ctx, ctx ? ctx->count : 0, caps_total_count, caps_use_count,
				265	caps_reserve_count, caps_avail_count);
				266	caps_use_count--;
				267	/*
				268	* Keep some preallocated caps around, at least enough to do a
				269	* readdir (which needs to preallocate lots of them), to avoid
				270	* lots of free/alloc churn.
				271	*/
				272	if (caps_avail_count >= caps_reserve_count +
				273	ceph_client(cap->ci->vfs_inode.i_sb)->mount_args.max_readdir) {
				274	caps_total_count--;
				275	kmem_cache_free(ceph_cap_cachep, cap);
				276	} else {
				277	if (ctx) {
				278	ctx->count++;
				279	caps_reserve_count++;
				280	} else {
				281	caps_avail_count++;
				282	}
				283	list_add(&cap->caps_item, &caps_list);
				284	}
				285
				286	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				287	caps_avail_count);
				288	spin_unlock(&caps_list_lock);
				289	}
				290
				291	void ceph_reservation_status(struct ceph_client *client,
				292	int total, int avail, int used, int reserved)
				293	{
				294	if (total)
				295	*total = caps_total_count;
				296	if (avail)
				297	*avail = caps_avail_count;
				298	if (used)
				299	*used = caps_use_count;
				300	if (reserved)
				301	*reserved = caps_reserve_count;
				302	}
				303
				304	/*
				305	* Find ceph_cap for given mds, if any.
				306	*
				307	* Called with i_lock held.
				308	*/
				309	static struct ceph_cap __get_cap_for_mds(struct ceph_inode_info ci, int mds)
				310	{
				311	struct ceph_cap *cap;
				312	struct rb_node *n = ci->i_caps.rb_node;
				313
				314	while (n) {
				315	cap = rb_entry(n, struct ceph_cap, ci_node);
				316	if (mds < cap->mds)
				317	n = n->rb_left;
				318	else if (mds > cap->mds)
				319	n = n->rb_right;
				320	else
				321	return cap;
				322	}
				323	return NULL;
				324	}
				325
				326	/*
				327	* Return id of any MDS with a cap, preferably FILE_WR\|WRBUFFER\|EXCL, else
				328	* -1.
				329	*/
				330	static int __ceph_get_cap_mds(struct ceph_inode_info ci, u32 mseq)
				331	{
				332	struct ceph_cap *cap;
				333	int mds = -1;
				334	struct rb_node *p;
				335
				336	/* prefer mds with WR\|WRBUFFER\|EXCL caps */
				337	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				338	cap = rb_entry(p, struct ceph_cap, ci_node);
				339	mds = cap->mds;
				340	if (mseq)
				341	*mseq = cap->mseq;
				342	if (cap->issued & (CEPH_CAP_FILE_WR \|
				343	CEPH_CAP_FILE_BUFFER \|
				344	CEPH_CAP_FILE_EXCL))
				345	break;
				346	}
				347	return mds;
				348	}
				349
				350	int ceph_get_cap_mds(struct inode *inode)
				351	{
				352	int mds;
				353	spin_lock(&inode->i_lock);
				354	mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
				355	spin_unlock(&inode->i_lock);
				356	return mds;
				357	}
				358
				359	/*
				360	* Called under i_lock.
				361	*/
				362	static void __insert_cap_node(struct ceph_inode_info *ci,
				363	struct ceph_cap *new)
				364	{
				365	struct rb_node **p = &ci->i_caps.rb_node;
				366	struct rb_node *parent = NULL;
				367	struct ceph_cap *cap = NULL;
				368
				369	while (*p) {
				370	parent = *p;
				371	cap = rb_entry(parent, struct ceph_cap, ci_node);
				372	if (new->mds < cap->mds)
				373	p = &(*p)->rb_left;
				374	else if (new->mds > cap->mds)
				375	p = &(*p)->rb_right;
				376	else
				377	BUG();
				378	}
				379
				380	rb_link_node(&new->ci_node, parent, p);
				381	rb_insert_color(&new->ci_node, &ci->i_caps);
				382	}
				383
				384	/*
				385	* (re)set cap hold timeouts, which control the delayed release
				386	* of unused caps back to the MDS. Should be called on cap use.
				387	*/
				388	static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
				389	struct ceph_inode_info *ci)
				390	{
				391	struct ceph_mount_args *ma = &mdsc->client->mount_args;
				392
				393	ci->i_hold_caps_min = round_jiffies(jiffies +
				394	ma->caps_wanted_delay_min * HZ);
				395	ci->i_hold_caps_max = round_jiffies(jiffies +
				396	ma->caps_wanted_delay_max * HZ);
				397	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
				398	ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
				399	}
				400
				401	/*
				402	* (Re)queue cap at the end of the delayed cap release list.
				403	*
				404	* If I_FLUSH is set, leave the inode at the front of the list.
				405	*
				406	* Caller holds i_lock
				407	* -> we take mdsc->cap_delay_lock
				408	*/
				409	static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				410	struct ceph_inode_info *ci)
				411	{
				412	__cap_set_timeouts(mdsc, ci);
				413	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
				414	ci->i_ceph_flags, ci->i_hold_caps_max);
				415	if (!mdsc->stopping) {
				416	spin_lock(&mdsc->cap_delay_lock);
				417	if (!list_empty(&ci->i_cap_delay_list)) {
				418	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				419	goto no_change;
				420	list_del_init(&ci->i_cap_delay_list);
				421	}
				422	list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				423	no_change:
				424	spin_unlock(&mdsc->cap_delay_lock);
				425	}
				426	}
				427
				428	/*
				429	* Queue an inode for immediate writeback. Mark inode with I_FLUSH,
				430	* indicating we should send a cap message to flush dirty metadata
				431	* asap, and move to the front of the delayed cap list.
				432	*/
				433	static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				434	struct ceph_inode_info *ci)
				435	{
				436	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
				437	spin_lock(&mdsc->cap_delay_lock);
				438	ci->i_ceph_flags \|= CEPH_I_FLUSH;
				439	if (!list_empty(&ci->i_cap_delay_list))
				440	list_del_init(&ci->i_cap_delay_list);
				441	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				442	spin_unlock(&mdsc->cap_delay_lock);
				443	}
				444
				445	/*
				446	* Cancel delayed work on cap.
				447	*
				448	* Caller must hold i_lock.
				449	*/
				450	static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
				451	struct ceph_inode_info *ci)
				452	{
				453	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
				454	if (list_empty(&ci->i_cap_delay_list))
				455	return;
				456	spin_lock(&mdsc->cap_delay_lock);
				457	list_del_init(&ci->i_cap_delay_list);
				458	spin_unlock(&mdsc->cap_delay_lock);
				459	}
				460
				461	/*
				462	* Common issue checks for add_cap, handle_cap_grant.
				463	*/
				464	static void __check_cap_issue(struct ceph_inode_info ci, struct ceph_cap cap,
				465	unsigned issued)
				466	{
				467	unsigned had = __ceph_caps_issued(ci, NULL);
				468
				469	/*
				470	* Each time we receive FILE_CACHE anew, we increment
				471	* i_rdcache_gen.
				472	*/
				473	if ((issued & CEPH_CAP_FILE_CACHE) &&
				474	(had & CEPH_CAP_FILE_CACHE) == 0)
				475	ci->i_rdcache_gen++;
				476
				477	/*
				478	* if we are newly issued FILE_SHARED, clear I_COMPLETE; we
				479	* don't know what happened to this directory while we didn't
				480	* have the cap.
				481	*/
				482	if ((issued & CEPH_CAP_FILE_SHARED) &&
				483	(had & CEPH_CAP_FILE_SHARED) == 0) {
				484	ci->i_shared_gen++;
				485	if (S_ISDIR(ci->vfs_inode.i_mode)) {
				486	dout(" marking %p NOT complete\n", &ci->vfs_inode);
				487	ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
				488	}
				489	}
				490	}
				491
				492	/*
				493	* Add a capability under the given MDS session.
				494	*
				495	* Caller should hold session snap_rwsem (read) and s_mutex.
				496	*
				497	* @fmode is the open file mode, if we are opening a file, otherwise
				498	* it is < 0. (This is so we can atomically add the cap and add an
				499	* open file reference to it.)
				500	*/
				501	int ceph_add_cap(struct inode *inode,
				502	struct ceph_mds_session *session, u64 cap_id,
				503	int fmode, unsigned issued, unsigned wanted,
				504	unsigned seq, unsigned mseq, u64 realmino, int flags,
				505	struct ceph_cap_reservation *caps_reservation)
				506	{
				507	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
				508	struct ceph_inode_info *ci = ceph_inode(inode);
				509	struct ceph_cap *new_cap = NULL;
				510	struct ceph_cap *cap;
				511	int mds = session->s_mds;
				512	int actual_wanted;
				513
				514	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
				515	session->s_mds, cap_id, ceph_cap_string(issued), seq);
				516
				517	/*
				518	* If we are opening the file, include file mode wanted bits
				519	* in wanted.
				520	*/
				521	if (fmode >= 0)
				522	wanted \|= ceph_caps_for_mode(fmode);
				523
				524	retry:
				525	spin_lock(&inode->i_lock);
				526	cap = __get_cap_for_mds(ci, mds);
				527	if (!cap) {
				528	if (new_cap) {
				529	cap = new_cap;
				530	new_cap = NULL;
				531	} else {
				532	spin_unlock(&inode->i_lock);
				533	new_cap = get_cap(caps_reservation);
				534	if (new_cap == NULL)
				535	return -ENOMEM;
				536	goto retry;
				537	}
				538
				539	cap->issued = 0;
				540	cap->implemented = 0;
				541	cap->mds = mds;
				542	cap->mds_wanted = 0;
				543
				544	cap->ci = ci;
				545	__insert_cap_node(ci, cap);
				546
				547	/* clear out old exporting info? (i.e. on cap import) */
				548	if (ci->i_cap_exporting_mds == mds) {
				549	ci->i_cap_exporting_issued = 0;
				550	ci->i_cap_exporting_mseq = 0;
				551	ci->i_cap_exporting_mds = -1;
				552	}
				553
				554	/* add to session cap list */
				555	cap->session = session;
				556	spin_lock(&session->s_cap_lock);
				557	list_add_tail(&cap->session_caps, &session->s_caps);
				558	session->s_nr_caps++;
				559	spin_unlock(&session->s_cap_lock);
				560	}
				561
				562	if (!ci->i_snap_realm) {
				563	/*
				564	* add this inode to the appropriate snap realm
				565	*/
				566	struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
				567	realmino);
				568	if (realm) {
				569	ceph_get_snap_realm(mdsc, realm);
				570	spin_lock(&realm->inodes_with_caps_lock);
				571	ci->i_snap_realm = realm;
				572	list_add(&ci->i_snap_realm_item,
				573	&realm->inodes_with_caps);
				574	spin_unlock(&realm->inodes_with_caps_lock);
				575	} else {
				576	pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
				577	realmino);
				578	}
				579	}
				580
				581	__check_cap_issue(ci, cap, issued);
				582
				583	/*
				584	* If we are issued caps we don't want, or the mds' wanted
				585	* value appears to be off, queue a check so we'll release
				586	* later and/or update the mds wanted value.
				587	*/
				588	actual_wanted = __ceph_caps_wanted(ci);
				589	if ((wanted & ~actual_wanted) \|\|
				590	(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
				591	dout(" issued %s, mds wanted %s, actual %s, queueing\n",
				592	ceph_cap_string(issued), ceph_cap_string(wanted),
				593	ceph_cap_string(actual_wanted));
				594	__cap_delay_requeue(mdsc, ci);
				595	}
				596
				597	if (flags & CEPH_CAP_FLAG_AUTH)
				598	ci->i_auth_cap = cap;
				599	else if (ci->i_auth_cap == cap)
				600	ci->i_auth_cap = NULL;
				601
				602	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
				603	inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
				604	ceph_cap_string(issued\|cap->issued), seq, mds);
				605	cap->cap_id = cap_id;
				606	cap->issued = issued;
				607	cap->implemented \|= issued;
				608	cap->mds_wanted \|= wanted;
				609	cap->seq = seq;
				610	cap->issue_seq = seq;
				611	cap->mseq = mseq;
				612	cap->gen = session->s_cap_gen;
				613
				614	if (fmode >= 0)
				615	__ceph_get_fmode(ci, fmode);
				616	spin_unlock(&inode->i_lock);
				617	wake_up(&ci->i_cap_wq);
				618	return 0;
				619	}
				620
				621	/*
				622	* Return true if cap has not timed out and belongs to the current
				623	* generation of the MDS session (i.e. has not gone 'stale' due to
				624	* us losing touch with the mds).
				625	*/
				626	static int __cap_is_valid(struct ceph_cap *cap)
				627	{
				628	unsigned long ttl;
				629	u32 gen;
				630
				631	spin_lock(&cap->session->s_cap_lock);
				632	gen = cap->session->s_cap_gen;
				633	ttl = cap->session->s_cap_ttl;
				634	spin_unlock(&cap->session->s_cap_lock);
				635
				636	if (cap->gen < gen \|\| time_after_eq(jiffies, ttl)) {
				637	dout("__cap_is_valid %p cap %p issued %s "
				638	"but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
				639	cap, ceph_cap_string(cap->issued), cap->gen, gen);
				640	return 0;
				641	}
				642
				643	return 1;
				644	}
				645
				646	/*
				647	* Return set of valid cap bits issued to us. Note that caps time
				648	* out, and may be invalidated in bulk if the client session times out
				649	* and session->s_cap_gen is bumped.
				650	*/
				651	int __ceph_caps_issued(struct ceph_inode_info ci, int implemented)
				652	{
				653	int have = ci->i_snap_caps;
				654	struct ceph_cap *cap;
				655	struct rb_node *p;
				656
				657	if (implemented)
				658	*implemented = 0;
				659	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				660	cap = rb_entry(p, struct ceph_cap, ci_node);
				661	if (!__cap_is_valid(cap))
				662	continue;
				663	dout("__ceph_caps_issued %p cap %p issued %s\n",
				664	&ci->vfs_inode, cap, ceph_cap_string(cap->issued));
				665	have \|= cap->issued;
				666	if (implemented)
				667	*implemented \|= cap->implemented;
				668	}
				669	return have;
				670	}
				671
				672	/*
				673	* Get cap bits issued by caps other than @ocap
				674	*/
				675	int __ceph_caps_issued_other(struct ceph_inode_info ci, struct ceph_cap ocap)
				676	{
				677	int have = ci->i_snap_caps;
				678	struct ceph_cap *cap;
				679	struct rb_node *p;
				680
				681	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				682	cap = rb_entry(p, struct ceph_cap, ci_node);
				683	if (cap == ocap)
				684	continue;
				685	if (!__cap_is_valid(cap))
				686	continue;
				687	have \|= cap->issued;
				688	}
				689	return have;
				690	}
				691
				692	/*
				693	* Move a cap to the end of the LRU (oldest caps at list head, newest
				694	* at list tail).
				695	*/
				696	static void __touch_cap(struct ceph_cap *cap)
				697	{
				698	struct ceph_mds_session *s = cap->session;
				699
				700	dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
				701	s->s_mds);
				702	spin_lock(&s->s_cap_lock);
				703	list_move_tail(&cap->session_caps, &s->s_caps);
				704	spin_unlock(&s->s_cap_lock);
				705	}
				706
				707	/*
				708	* Check if we hold the given mask. If so, move the cap(s) to the
				709	* front of their respective LRUs. (This is the preferred way for
				710	* callers to check for caps they want.)
				711	*/
				712	int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
				713	{
				714	struct ceph_cap *cap;
				715	struct rb_node *p;
				716	int have = ci->i_snap_caps;
				717
				718	if ((have & mask) == mask) {
				719	dout("__ceph_caps_issued_mask %p snap issued %s"
				720	" (mask %s)\n", &ci->vfs_inode,
				721	ceph_cap_string(have),
				722	ceph_cap_string(mask));
				723	return 1;
				724	}
				725
				726	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				727	cap = rb_entry(p, struct ceph_cap, ci_node);
				728	if (!__cap_is_valid(cap))
				729	continue;
				730	if ((cap->issued & mask) == mask) {
				731	dout("__ceph_caps_issued_mask %p cap %p issued %s"
				732	" (mask %s)\n", &ci->vfs_inode, cap,
				733	ceph_cap_string(cap->issued),
				734	ceph_cap_string(mask));
				735	if (touch)
				736	__touch_cap(cap);
				737	return 1;
				738	}
				739
				740	/* does a combination of caps satisfy mask? */
				741	have \|= cap->issued;
				742	if ((have & mask) == mask) {
				743	dout("__ceph_caps_issued_mask %p combo issued %s"
				744	" (mask %s)\n", &ci->vfs_inode,
				745	ceph_cap_string(cap->issued),
				746	ceph_cap_string(mask));
				747	if (touch) {
				748	struct rb_node *q;
				749
				750	/* touch this + preceeding caps */
				751	__touch_cap(cap);
				752	for (q = rb_first(&ci->i_caps); q != p;
				753	q = rb_next(q)) {
				754	cap = rb_entry(q, struct ceph_cap,
				755	ci_node);
				756	if (!__cap_is_valid(cap))
				757	continue;
				758	__touch_cap(cap);
				759	}
				760	}
				761	return 1;
				762	}
				763	}
				764
				765	return 0;
				766	}
				767
				768	/*
				769	* Return true if mask caps are currently being revoked by an MDS.
				770	*/
				771	int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
				772	{
				773	struct inode *inode = &ci->vfs_inode;
				774	struct ceph_cap *cap;
				775	struct rb_node *p;
				776	int ret = 0;
				777
				778	spin_lock(&inode->i_lock);
				779	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				780	cap = rb_entry(p, struct ceph_cap, ci_node);
				781	if (__cap_is_valid(cap) &&
				782	(cap->implemented & ~cap->issued & mask)) {
				783	ret = 1;
				784	break;
				785	}
				786	}
				787	spin_unlock(&inode->i_lock);
				788	dout("ceph_caps_revoking %p %s = %d\n", inode,
				789	ceph_cap_string(mask), ret);
				790	return ret;
				791	}
				792
				793	int __ceph_caps_used(struct ceph_inode_info *ci)
				794	{
				795	int used = 0;
				796	if (ci->i_pin_ref)
				797	used \|= CEPH_CAP_PIN;
				798	if (ci->i_rd_ref)
				799	used \|= CEPH_CAP_FILE_RD;
				800	if (ci->i_rdcache_ref \|\| ci->i_rdcache_gen)
				801	used \|= CEPH_CAP_FILE_CACHE;
				802	if (ci->i_wr_ref)
				803	used \|= CEPH_CAP_FILE_WR;
				804	if (ci->i_wrbuffer_ref)
				805	used \|= CEPH_CAP_FILE_BUFFER;
				806	return used;
				807	}
				808
				809	/*
				810	* wanted, by virtue of open file modes
				811	*/
				812	int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
				813	{
				814	int want = 0;
				815	int mode;
				816	for (mode = 0; mode < 4; mode++)
				817	if (ci->i_nr_by_mode[mode])
				818	want \|= ceph_caps_for_mode(mode);
				819	return want;
				820	}
				821
				822	/*
				823	* Return caps we have registered with the MDS(s) as 'wanted'.
				824	*/
				825	int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
				826	{
				827	struct ceph_cap *cap;
				828	struct rb_node *p;
				829	int mds_wanted = 0;
				830
				831	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				832	cap = rb_entry(p, struct ceph_cap, ci_node);
				833	if (!__cap_is_valid(cap))
				834	continue;
				835	mds_wanted \|= cap->mds_wanted;
				836	}
				837	return mds_wanted;
				838	}
				839
				840	/*
				841	* called under i_lock
				842	*/
				843	static int __ceph_is_any_caps(struct ceph_inode_info *ci)
				844	{
				845	return !RB_EMPTY_ROOT(&ci->i_caps) \|\| ci->i_cap_exporting_mds >= 0;
				846	}
				847
				848	/*
				849	* caller should hold i_lock, and session s_mutex.
				850	* returns true if this is the last cap. if so, caller should iput.
				851	*/
				852	void __ceph_remove_cap(struct ceph_cap *cap,
				853	struct ceph_cap_reservation *ctx)
				854	{
				855	struct ceph_mds_session *session = cap->session;
				856	struct ceph_inode_info *ci = cap->ci;
				857	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
				858
				859	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
				860
				861	/* remove from session list */
				862	spin_lock(&session->s_cap_lock);
				863	list_del_init(&cap->session_caps);
				864	session->s_nr_caps--;
				865	spin_unlock(&session->s_cap_lock);
				866
				867	/* remove from inode list */
				868	rb_erase(&cap->ci_node, &ci->i_caps);
				869	cap->session = NULL;
				870	if (ci->i_auth_cap == cap)
				871	ci->i_auth_cap = NULL;
				872
				873	put_cap(cap, ctx);
				874
				875	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
				876	struct ceph_snap_realm *realm = ci->i_snap_realm;
				877	spin_lock(&realm->inodes_with_caps_lock);
				878	list_del_init(&ci->i_snap_realm_item);
				879	ci->i_snap_realm_counter++;
				880	ci->i_snap_realm = NULL;
				881	spin_unlock(&realm->inodes_with_caps_lock);
				882	ceph_put_snap_realm(mdsc, realm);
				883	}
				884	if (!__ceph_is_any_real_caps(ci))
				885	__cap_delay_cancel(mdsc, ci);
				886	}
				887
				888	/*
				889	* Build and send a cap message to the given MDS.
				890	*
				891	* Caller should be holding s_mutex.
				892	*/
				893	static int send_cap_msg(struct ceph_mds_session *session,
				894	u64 ino, u64 cid, int op,
				895	int caps, int wanted, int dirty,
				896	u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
				897	u64 size, u64 max_size,
				898	struct timespec mtime, struct timespec atime,
				899	u64 time_warp_seq,
				900	uid_t uid, gid_t gid, mode_t mode,
				901	u64 xattr_version,
				902	struct ceph_buffer *xattrs_buf,
				903	u64 follows)
				904	{
				905	struct ceph_mds_caps *fc;
				906	struct ceph_msg *msg;
				907
				908	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
				909	" seq %u/%u mseq %u follows %lld size %llu/%llu"
				910	" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
				911	cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
				912	ceph_cap_string(dirty),
				913	seq, issue_seq, mseq, follows, size, max_size,
				914	xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
				915
				916	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
				917	if (IS_ERR(msg))
				918	return PTR_ERR(msg);
				919
				920	fc = msg->front.iov_base;
				921
				922	memset(fc, 0, sizeof(*fc));
				923
				924	fc->cap_id = cpu_to_le64(cid);
				925	fc->op = cpu_to_le32(op);
				926	fc->seq = cpu_to_le32(seq);
				927	fc->client_tid = cpu_to_le64(flush_tid);
				928	fc->issue_seq = cpu_to_le32(issue_seq);
				929	fc->migrate_seq = cpu_to_le32(mseq);
				930	fc->caps = cpu_to_le32(caps);
				931	fc->wanted = cpu_to_le32(wanted);
				932	fc->dirty = cpu_to_le32(dirty);
				933	fc->ino = cpu_to_le64(ino);
				934	fc->snap_follows = cpu_to_le64(follows);
				935
				936	fc->size = cpu_to_le64(size);
				937	fc->max_size = cpu_to_le64(max_size);
				938	if (mtime)
				939	ceph_encode_timespec(&fc->mtime, mtime);
				940	if (atime)
				941	ceph_encode_timespec(&fc->atime, atime);
				942	fc->time_warp_seq = cpu_to_le32(time_warp_seq);
				943
				944	fc->uid = cpu_to_le32(uid);
				945	fc->gid = cpu_to_le32(gid);
				946	fc->mode = cpu_to_le32(mode);
				947
				948	fc->xattr_version = cpu_to_le64(xattr_version);
				949	if (xattrs_buf) {
				950	msg->middle = ceph_buffer_get(xattrs_buf);
				951	fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
				952	msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
				953	}
				954
				955	ceph_con_send(&session->s_con, msg);
				956	return 0;
				957	}
				958
				959	/*
				960	* Queue cap releases when an inode is dropped from our
				961	* cache.
				962	*/
				963	void ceph_queue_caps_release(struct inode *inode)
				964	{
				965	struct ceph_inode_info *ci = ceph_inode(inode);
				966	struct rb_node *p;
				967
				968	spin_lock(&inode->i_lock);
				969	p = rb_first(&ci->i_caps);
				970	while (p) {
				971	struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
				972	struct ceph_mds_session *session = cap->session;
				973	struct ceph_msg *msg;
				974	struct ceph_mds_cap_release *head;
				975	struct ceph_mds_cap_item *item;
				976
				977	spin_lock(&session->s_cap_lock);
				978	BUG_ON(!session->s_num_cap_releases);
				979	msg = list_first_entry(&session->s_cap_releases,
				980	struct ceph_msg, list_head);
				981
				982	dout(" adding %p release to mds%d msg %p (%d left)\n",
				983	inode, session->s_mds, msg, session->s_num_cap_releases);
				984
				985	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
				986	head = msg->front.iov_base;
				987	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
				988	item = msg->front.iov_base + msg->front.iov_len;
				989	item->ino = cpu_to_le64(ceph_ino(inode));
				990	item->cap_id = cpu_to_le64(cap->cap_id);
				991	item->migrate_seq = cpu_to_le32(cap->mseq);
				992	item->seq = cpu_to_le32(cap->issue_seq);
				993
				994	session->s_num_cap_releases--;
				995
				996	msg->front.iov_len += sizeof(*item);
				997	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
				998	dout(" release msg %p full\n", msg);
				999	list_move_tail(&msg->list_head,
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1000	&session->s_cap_releases_done);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1001	} else {
				1002	dout(" release msg %p at %d/%d (%d)\n", msg,
				1003	(int)le32_to_cpu(head->num),
				1004	(int)CEPH_CAPS_PER_RELEASE,
				1005	(int)msg->front.iov_len);
				1006	}
				1007	spin_unlock(&session->s_cap_lock);
				1008	p = rb_next(p);
				1009	__ceph_remove_cap(cap, NULL);
				1010
				1011	}
				1012	spin_unlock(&inode->i_lock);
				1013	}
				1014
				1015	/*
				1016	* Send a cap msg on the given inode. Update our caps state, then
				1017	* drop i_lock and send the message.
				1018	*
				1019	* Make note of max_size reported/requested from mds, revoked caps
				1020	* that have now been implemented.
				1021	*
				1022	* Make half-hearted attempt ot to invalidate page cache if we are
				1023	* dropping RDCACHE. Note that this will leave behind locked pages
				1024	* that we'll then need to deal with elsewhere.
				1025	*
				1026	* Return non-zero if delayed release, or we experienced an error
				1027	* such that the caller should requeue + retry later.
				1028	*
				1029	* called with i_lock, then drops it.
				1030	* caller should hold snap_rwsem (read), s_mutex.
				1031	*/
				1032	static int __send_cap(struct ceph_mds_client mdsc, struct ceph_cap cap,
				1033	int op, int used, int want, int retain, int flushing,
				1034	unsigned *pflush_tid)
				1035	__releases(cap->ci->vfs_inode->i_lock)
				1036	{
				1037	struct ceph_inode_info *ci = cap->ci;
				1038	struct inode *inode = &ci->vfs_inode;
				1039	u64 cap_id = cap->cap_id;
				1040	int held = cap->issued \| cap->implemented;
				1041	int revoking = cap->implemented & ~cap->issued;
				1042	int dropping = cap->issued & ~retain;
				1043	int keep;
				1044	u64 seq, issue_seq, mseq, time_warp_seq, follows;
				1045	u64 size, max_size;
				1046	struct timespec mtime, atime;
				1047	int wake = 0;
				1048	mode_t mode;
				1049	uid_t uid;
				1050	gid_t gid;
				1051	struct ceph_mds_session *session;
				1052	u64 xattr_version = 0;
				1053	int delayed = 0;
				1054	u64 flush_tid = 0;
				1055	int i;
				1056	int ret;
				1057
				1058	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
				1059	inode, cap, cap->session,
				1060	ceph_cap_string(held), ceph_cap_string(held & retain),
				1061	ceph_cap_string(revoking));
				1062	BUG_ON((retain & CEPH_CAP_PIN) == 0);
				1063
				1064	session = cap->session;
				1065
				1066	/* don't release wanted unless we've waited a bit. */
				1067	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1068	time_before(jiffies, ci->i_hold_caps_min)) {
				1069	dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
				1070	ceph_cap_string(cap->issued),
				1071	ceph_cap_string(cap->issued & retain),
				1072	ceph_cap_string(cap->mds_wanted),
				1073	ceph_cap_string(want));
				1074	want \|= cap->mds_wanted;
				1075	retain \|= cap->issued;
				1076	delayed = 1;
				1077	}
				1078	ci->i_ceph_flags &= ~(CEPH_I_NODELAY \| CEPH_I_FLUSH);
				1079
				1080	cap->issued &= retain; /* drop bits we don't want */
				1081	if (cap->implemented & ~cap->issued) {
				1082	/*
				1083	* Wake up any waiters on wanted -> needed transition.
				1084	* This is due to the weird transition from buffered
				1085	* to sync IO... we need to flush dirty pages _before_
				1086	* allowing sync writes to avoid reordering.
				1087	*/
				1088	wake = 1;
				1089	}
				1090	cap->implemented &= cap->issued \| used;
				1091	cap->mds_wanted = want;
				1092
				1093	if (flushing) {
				1094	/*
				1095	* assign a tid for flush operations so we can avoid
				1096	* flush1 -> dirty1 -> flush2 -> flushack1 -> mark
				1097	* clean type races. track latest tid for every bit
				1098	* so we can handle flush AxFw, flush Fw, and have the
				1099	* first ack clean Ax.
				1100	*/
				1101	flush_tid = ++ci->i_cap_flush_last_tid;
				1102	if (pflush_tid)
				1103	*pflush_tid = flush_tid;
				1104	dout(" cap_flush_tid %d\n", (int)flush_tid);
				1105	for (i = 0; i < CEPH_CAP_BITS; i++)
				1106	if (flushing & (1 << i))
				1107	ci->i_cap_flush_tid[i] = flush_tid;
				1108	}
				1109
				1110	keep = cap->implemented;
				1111	seq = cap->seq;
				1112	issue_seq = cap->issue_seq;
				1113	mseq = cap->mseq;
				1114	size = inode->i_size;
				1115	ci->i_reported_size = size;
				1116	max_size = ci->i_wanted_max_size;
				1117	ci->i_requested_max_size = max_size;
				1118	mtime = inode->i_mtime;
				1119	atime = inode->i_atime;
				1120	time_warp_seq = ci->i_time_warp_seq;
				1121	follows = ci->i_snap_realm->cached_context->seq;
				1122	uid = inode->i_uid;
				1123	gid = inode->i_gid;
				1124	mode = inode->i_mode;
				1125
				1126	if (dropping & CEPH_CAP_XATTR_EXCL) {
				1127	__ceph_build_xattrs_blob(ci);
				1128	xattr_version = ci->i_xattrs.version + 1;
				1129	}
				1130
				1131	spin_unlock(&inode->i_lock);
				1132
				1133	if (dropping & CEPH_CAP_FILE_CACHE) {
				1134	/* invalidate what we can */
				1135	dout("invalidating pages on %p\n", inode);
				1136	invalidate_mapping_pages(&inode->i_data, 0, -1);
				1137	}
				1138
				1139	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
				1140	op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
				1141	size, max_size, &mtime, &atime, time_warp_seq,
				1142	uid, gid, mode,
				1143	xattr_version,
				1144	(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
				1145	follows);
				1146	if (ret < 0) {
				1147	dout("error sending cap msg, must requeue %p\n", inode);
				1148	delayed = 1;
				1149	}
				1150
				1151	if (wake)
				1152	wake_up(&ci->i_cap_wq);
				1153
				1154	return delayed;
				1155	}
				1156
				1157	/*
				1158	* When a snapshot is taken, clients accumulate dirty metadata on
				1159	* inodes with capabilities in ceph_cap_snaps to describe the file
				1160	* state at the time the snapshot was taken. This must be flushed
				1161	* asynchronously back to the MDS once sync writes complete and dirty
				1162	* data is written out.
				1163	*
				1164	* Called under i_lock. Takes s_mutex as needed.
				1165	*/
				1166	void __ceph_flush_snaps(struct ceph_inode_info *ci,
				1167	struct ceph_mds_session **psession)
				1168	{
				1169	struct inode *inode = &ci->vfs_inode;
				1170	int mds;
				1171	struct ceph_cap_snap *capsnap;
				1172	u32 mseq;
				1173	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
				1174	struct ceph_mds_session session = NULL; / if session != NULL, we hold
				1175	session->s_mutex */
				1176	u64 next_follows = 0; /* keep track of how far we've gotten through the
				1177	i_cap_snaps list, and skip these entries next time
				1178	around to avoid an infinite loop */
				1179
				1180	if (psession)
				1181	session = *psession;
				1182
				1183	dout("__flush_snaps %p\n", inode);
				1184	retry:
				1185	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				1186	/* avoid an infiniute loop after retry */
				1187	if (capsnap->follows < next_follows)
				1188	continue;
				1189	/*
				1190	* we need to wait for sync writes to complete and for dirty
				1191	* pages to be written out.
				1192	*/
				1193	if (capsnap->dirty_pages \|\| capsnap->writing)
				1194	continue;
				1195
				1196	/* pick mds, take s_mutex */
				1197	mds = __ceph_get_cap_mds(ci, &mseq);
				1198	if (session && session->s_mds != mds) {
				1199	dout("oops, wrong session %p mutex\n", session);
				1200	mutex_unlock(&session->s_mutex);
				1201	ceph_put_mds_session(session);
				1202	session = NULL;
				1203	}
				1204	if (!session) {
				1205	spin_unlock(&inode->i_lock);
				1206	mutex_lock(&mdsc->mutex);
				1207	session = __ceph_lookup_mds_session(mdsc, mds);
				1208	mutex_unlock(&mdsc->mutex);
				1209	if (session) {
				1210	dout("inverting session/ino locks on %p\n",
				1211	session);
				1212	mutex_lock(&session->s_mutex);
				1213	}
				1214	/*
				1215	* if session == NULL, we raced against a cap
				1216	* deletion. retry, and we'll get a better
				1217	* @mds value next time.
				1218	*/
				1219	spin_lock(&inode->i_lock);
				1220	goto retry;
				1221	}
				1222
				1223	capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
				1224	atomic_inc(&capsnap->nref);
				1225	if (!list_empty(&capsnap->flushing_item))
				1226	list_del_init(&capsnap->flushing_item);
				1227	list_add_tail(&capsnap->flushing_item,
				1228	&session->s_cap_snaps_flushing);
				1229	spin_unlock(&inode->i_lock);
				1230
				1231	dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
				1232	inode, capsnap, next_follows, capsnap->size);
				1233	send_cap_msg(session, ceph_vino(inode).ino, 0,
				1234	CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
				1235	capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
				1236	capsnap->size, 0,
				1237	&capsnap->mtime, &capsnap->atime,
				1238	capsnap->time_warp_seq,
				1239	capsnap->uid, capsnap->gid, capsnap->mode,
				1240	0, NULL,
				1241	capsnap->follows);
				1242
				1243	next_follows = capsnap->follows + 1;
				1244	ceph_put_cap_snap(capsnap);
				1245
				1246	spin_lock(&inode->i_lock);
				1247	goto retry;
				1248	}
				1249
				1250	/* we flushed them all; remove this inode from the queue */
				1251	spin_lock(&mdsc->snap_flush_lock);
				1252	list_del_init(&ci->i_snap_flush_item);
				1253	spin_unlock(&mdsc->snap_flush_lock);
				1254
				1255	if (psession)
				1256	*psession = session;
				1257	else if (session) {
				1258	mutex_unlock(&session->s_mutex);
				1259	ceph_put_mds_session(session);
				1260	}
				1261	}
				1262
				1263	static void ceph_flush_snaps(struct ceph_inode_info *ci)
				1264	{
				1265	struct inode *inode = &ci->vfs_inode;
				1266
				1267	spin_lock(&inode->i_lock);
				1268	__ceph_flush_snaps(ci, NULL);
				1269	spin_unlock(&inode->i_lock);
				1270	}
				1271
				1272	/*
				1273	* Add dirty inode to the flushing list. Assigned a seq number so we
				1274	* can wait for caps to flush without starving.
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1275	*
				1276	* Called under i_lock.
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1277	*/
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1278	static int __mark_caps_flushing(struct inode *inode,
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1279	struct ceph_mds_session *session)
				1280	{
				1281	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1282	struct ceph_inode_info *ci = ceph_inode(inode);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1283	int flushing;
				1284
				1285	BUG_ON(ci->i_dirty_caps == 0);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1286	BUG_ON(list_empty(&ci->i_dirty_item));
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1287
				1288	flushing = ci->i_dirty_caps;
				1289	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
				1290	ceph_cap_string(flushing),
				1291	ceph_cap_string(ci->i_flushing_caps),
				1292	ceph_cap_string(ci->i_flushing_caps \| flushing));
				1293	ci->i_flushing_caps \|= flushing;
				1294	ci->i_dirty_caps = 0;
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1295	dout(" inode %p now !dirty\n", inode);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1296
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1297	spin_lock(&mdsc->cap_dirty_lock);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1298	list_del_init(&ci->i_dirty_item);
				1299
				1300	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1301	if (list_empty(&ci->i_flushing_item)) {
				1302	list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
				1303	mdsc->num_cap_flushing++;
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1304	dout(" inode %p now flushing seq %lld\n", inode,
				1305	ci->i_cap_flush_seq);
				1306	} else {
				1307	list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
				1308	dout(" inode %p now flushing (more) seq %lld\n", inode,
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1309	ci->i_cap_flush_seq);
				1310	}
				1311	spin_unlock(&mdsc->cap_dirty_lock);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1312
				1313	return flushing;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1314	}
				1315
				1316	/*
				1317	* Swiss army knife function to examine currently used and wanted
				1318	* versus held caps. Release, flush, ack revoked caps to mds as
				1319	* appropriate.
				1320	*
				1321	* CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
				1322	* cap release further.
				1323	* CHECK_CAPS_AUTHONLY - we should only check the auth cap
				1324	* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
				1325	* further delay.
				1326	*/
				1327	void ceph_check_caps(struct ceph_inode_info *ci, int flags,
				1328	struct ceph_mds_session *session)
				1329	{
				1330	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
				1331	struct ceph_mds_client *mdsc = &client->mdsc;
				1332	struct inode *inode = &ci->vfs_inode;
				1333	struct ceph_cap *cap;
				1334	int file_wanted, used;
				1335	int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
				1336	int drop_session_lock = session ? 0 : 1;
				1337	int want, retain, revoking, flushing = 0;
				1338	int mds = -1; /* keep track of how far we've gone through i_caps list
				1339	to avoid an infinite loop on retry */
				1340	struct rb_node *p;
				1341	int tried_invalidate = 0;
				1342	int delayed = 0, sent = 0, force_requeue = 0, num;
				1343	int is_delayed = flags & CHECK_CAPS_NODELAY;
				1344
				1345	/* if we are unmounting, flush any unused caps immediately. */
				1346	if (mdsc->stopping)
				1347	is_delayed = 1;
				1348
				1349	spin_lock(&inode->i_lock);
				1350
				1351	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				1352	flags \|= CHECK_CAPS_FLUSH;
				1353
				1354	/* flush snaps first time around only */
				1355	if (!list_empty(&ci->i_cap_snaps))
				1356	__ceph_flush_snaps(ci, &session);
				1357	goto retry_locked;
				1358	retry:
				1359	spin_lock(&inode->i_lock);
				1360	retry_locked:
				1361	file_wanted = __ceph_caps_file_wanted(ci);
				1362	used = __ceph_caps_used(ci);
				1363	want = file_wanted \| used;
				1364
				1365	retain = want \| CEPH_CAP_PIN;
				1366	if (!mdsc->stopping && inode->i_nlink > 0) {
				1367	if (want) {
				1368	retain \|= CEPH_CAP_ANY; /* be greedy */
				1369	} else {
				1370	retain \|= CEPH_CAP_ANY_SHARED;
				1371	/*
				1372	* keep RD only if we didn't have the file open RW,
				1373	* because then the mds would revoke it anyway to
				1374	* journal max_size=0.
				1375	*/
				1376	if (ci->i_max_size == 0)
				1377	retain \|= CEPH_CAP_ANY_RD;
				1378	}
				1379	}
				1380
				1381	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
				1382	" issued %s retain %s %s%s%s\n", inode,
				1383	ceph_cap_string(file_wanted),
				1384	ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
				1385	ceph_cap_string(ci->i_flushing_caps),
				1386	ceph_cap_string(__ceph_caps_issued(ci, NULL)),
				1387	ceph_cap_string(retain),
				1388	(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
				1389	(flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
				1390	(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
				1391
				1392	/*
				1393	* If we no longer need to hold onto old our caps, and we may
				1394	* have cached pages, but don't want them, then try to invalidate.
				1395	* If we fail, it's because pages are locked.... try again later.
				1396	*/
				1397	if ((!is_delayed \|\| mdsc->stopping) &&
				1398	ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
				1399	ci->i_rdcache_gen && /* may have cached pages */
				1400	file_wanted == 0 && /* no open files */
				1401	!ci->i_truncate_pending &&
				1402	!tried_invalidate) {
				1403	u32 invalidating_gen = ci->i_rdcache_gen;
				1404	int ret;
				1405
				1406	dout("check_caps trying to invalidate on %p\n", inode);
				1407	spin_unlock(&inode->i_lock);
				1408	ret = invalidate_inode_pages2(&inode->i_data);
				1409	spin_lock(&inode->i_lock);
				1410	if (ret == 0 && invalidating_gen == ci->i_rdcache_gen) {
				1411	/* success. */
				1412	ci->i_rdcache_gen = 0;
				1413	ci->i_rdcache_revoking = 0;
				1414	} else {
				1415	dout("check_caps failed to invalidate pages\n");
				1416	/* we failed to invalidate pages. check these
				1417	caps again later. */
				1418	force_requeue = 1;
				1419	__cap_set_timeouts(mdsc, ci);
				1420	}
				1421	tried_invalidate = 1;
				1422	goto retry_locked;
				1423	}
				1424
				1425	num = 0;
				1426	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				1427	cap = rb_entry(p, struct ceph_cap, ci_node);
				1428	num++;
				1429
				1430	/* avoid looping forever */
				1431	if (mds >= cap->mds \|\|
				1432	((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
				1433	continue;
				1434
				1435	/* NOTE: no side-effects allowed, until we take s_mutex */
				1436
				1437	revoking = cap->implemented & ~cap->issued;
				1438	if (revoking)
				1439	dout("mds%d revoking %s\n", cap->mds,
				1440	ceph_cap_string(revoking));
				1441
				1442	if (cap == ci->i_auth_cap &&
				1443	(cap->issued & CEPH_CAP_FILE_WR)) {
				1444	/* request larger max_size from MDS? */
				1445	if (ci->i_wanted_max_size > ci->i_max_size &&
				1446	ci->i_wanted_max_size > ci->i_requested_max_size) {
				1447	dout("requesting new max_size\n");
				1448	goto ack;
				1449	}
				1450
				1451	/* approaching file_max? */
				1452	if ((inode->i_size << 1) >= ci->i_max_size &&
				1453	(ci->i_reported_size << 1) < ci->i_max_size) {
				1454	dout("i_size approaching max_size\n");
				1455	goto ack;
				1456	}
				1457	}
				1458	/* flush anything dirty? */
				1459	if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
				1460	ci->i_dirty_caps) {
				1461	dout("flushing dirty caps\n");
				1462	goto ack;
				1463	}
				1464
				1465	/* completed revocation? going down and there are no caps? */
				1466	if (revoking && (revoking & used) == 0) {
				1467	dout("completed revocation of %s\n",
				1468	ceph_cap_string(cap->implemented & ~cap->issued));
				1469	goto ack;
				1470	}
				1471
				1472	/* want more caps from mds? */
				1473	if (want & ~(cap->mds_wanted \| cap->issued))
				1474	goto ack;
				1475
				1476	/* things we might delay */
				1477	if ((cap->issued & ~retain) == 0 &&
				1478	cap->mds_wanted == want)
				1479	continue; /* nope, all good */
				1480
				1481	if (is_delayed)
				1482	goto ack;
				1483
				1484	/* delay? */
				1485	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1486	time_before(jiffies, ci->i_hold_caps_max)) {
				1487	dout(" delaying issued %s -> %s, wanted %s -> %s\n",
				1488	ceph_cap_string(cap->issued),
				1489	ceph_cap_string(cap->issued & retain),
				1490	ceph_cap_string(cap->mds_wanted),
				1491	ceph_cap_string(want));
				1492	delayed++;
				1493	continue;
				1494	}
				1495
				1496	ack:
				1497	if (session && session != cap->session) {
				1498	dout("oops, wrong session %p mutex\n", session);
				1499	mutex_unlock(&session->s_mutex);
				1500	session = NULL;
				1501	}
				1502	if (!session) {
				1503	session = cap->session;
				1504	if (mutex_trylock(&session->s_mutex) == 0) {
				1505	dout("inverting session/ino locks on %p\n",
				1506	session);
				1507	spin_unlock(&inode->i_lock);
				1508	if (took_snap_rwsem) {
				1509	up_read(&mdsc->snap_rwsem);
				1510	took_snap_rwsem = 0;
				1511	}
				1512	mutex_lock(&session->s_mutex);
				1513	goto retry;
				1514	}
				1515	}
				1516	/* take snap_rwsem after session mutex */
				1517	if (!took_snap_rwsem) {
				1518	if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
				1519	dout("inverting snap/in locks on %p\n",
				1520	inode);
				1521	spin_unlock(&inode->i_lock);
				1522	down_read(&mdsc->snap_rwsem);
				1523	took_snap_rwsem = 1;
				1524	goto retry;
				1525	}
				1526	took_snap_rwsem = 1;
				1527	}
				1528
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1529	if (cap == ci->i_auth_cap && ci->i_dirty_caps)
				1530	flushing = __mark_caps_flushing(inode, session);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1531
				1532	mds = cap->mds; /* remember mds, so we don't repeat */
				1533	sent++;
				1534
				1535	/* __send_cap drops i_lock */
				1536	delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
				1537	retain, flushing, NULL);
				1538	goto retry; /* retake i_lock and restart our cap scan. */
				1539	}
				1540
				1541	/*
				1542	* Reschedule delayed caps release if we delayed anything,
				1543	* otherwise cancel.
				1544	*/
				1545	if (delayed && is_delayed)
				1546	force_requeue = 1; /* __send_cap delayed release; requeue */
				1547	if (!delayed && !is_delayed)
				1548	__cap_delay_cancel(mdsc, ci);
				1549	else if (!is_delayed \|\| force_requeue)
				1550	__cap_delay_requeue(mdsc, ci);
				1551
				1552	spin_unlock(&inode->i_lock);
				1553
				1554	if (session && drop_session_lock)
				1555	mutex_unlock(&session->s_mutex);
				1556	if (took_snap_rwsem)
				1557	up_read(&mdsc->snap_rwsem);
				1558	}
				1559
				1560	/*
				1561	* Mark caps dirty. If inode is newly dirty, add to the global dirty
				1562	* list.
				1563	*/
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1564	void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1565	{
				1566	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
				1567	struct inode *inode = &ci->vfs_inode;
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1568	int was_dirty = ci->i_dirty_caps;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1569	int dirty = 0;
				1570
				1571	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
				1572	ceph_cap_string(mask), ceph_cap_string(ci->i_dirty_caps),
				1573	ceph_cap_string(ci->i_dirty_caps \| mask));
				1574	ci->i_dirty_caps \|= mask;
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1575	if (!was_dirty) {
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1576	dout(" inode %p now dirty\n", &ci->vfs_inode);
				1577	spin_lock(&mdsc->cap_dirty_lock);
				1578	list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
				1579	spin_unlock(&mdsc->cap_dirty_lock);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1580	if (ci->i_flushing_caps == 0) {
				1581	igrab(inode);
				1582	dirty \|= I_DIRTY_SYNC;
				1583	}
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1584	}
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	1585	if (((was_dirty \| ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1586	(mask & CEPH_CAP_FILE_BUFFER))
				1587	dirty \|= I_DIRTY_DATASYNC;
				1588	if (dirty)
				1589	__mark_inode_dirty(inode, dirty);
				1590	__cap_delay_requeue(mdsc, ci);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1591	}
				1592
				1593	/*
				1594	* Try to flush dirty caps back to the auth mds.
				1595	*/
				1596	static int try_flush_caps(struct inode inode, struct ceph_mds_session session,
				1597	unsigned *flush_tid)
				1598	{
				1599	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1600	struct ceph_inode_info *ci = ceph_inode(inode);
				1601	int unlock_session = session ? 0 : 1;
				1602	int flushing = 0;
				1603
				1604	retry:
				1605	spin_lock(&inode->i_lock);
				1606	if (ci->i_dirty_caps && ci->i_auth_cap) {
				1607	struct ceph_cap *cap = ci->i_auth_cap;
				1608	int used = __ceph_caps_used(ci);
				1609	int want = __ceph_caps_wanted(ci);
				1610	int delayed;
				1611
				1612	if (!session) {
				1613	spin_unlock(&inode->i_lock);
				1614	session = cap->session;
				1615	mutex_lock(&session->s_mutex);
				1616	goto retry;
				1617	}
				1618	BUG_ON(session != cap->session);
				1619	if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
				1620	goto out;
				1621
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1622	flushing = __mark_caps_flushing(inode, session);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1623
				1624	/* __send_cap drops i_lock */
				1625	delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
				1626	cap->issued \| cap->implemented, flushing,
				1627	flush_tid);
				1628	if (!delayed)
				1629	goto out_unlocked;
				1630
				1631	spin_lock(&inode->i_lock);
				1632	__cap_delay_requeue(mdsc, ci);
				1633	}
				1634	out:
				1635	spin_unlock(&inode->i_lock);
				1636	out_unlocked:
				1637	if (session && unlock_session)
				1638	mutex_unlock(&session->s_mutex);
				1639	return flushing;
				1640	}
				1641
				1642	/*
				1643	* Return true if we've flushed caps through the given flush_tid.
				1644	*/
				1645	static int caps_are_flushed(struct inode *inode, unsigned tid)
				1646	{
				1647	struct ceph_inode_info *ci = ceph_inode(inode);
				1648	int dirty, i, ret = 1;
				1649
				1650	spin_lock(&inode->i_lock);
				1651	dirty = __ceph_caps_dirty(ci);
				1652	for (i = 0; i < CEPH_CAP_BITS; i++)
				1653	if ((ci->i_flushing_caps & (1 << i)) &&
				1654	ci->i_cap_flush_tid[i] <= tid) {
				1655	/* still flushing this bit */
				1656	ret = 0;
				1657	break;
				1658	}
				1659	spin_unlock(&inode->i_lock);
				1660	return ret;
				1661	}
				1662
				1663	/*
				1664	* Wait on any unsafe replies for the given inode. First wait on the
				1665	* newest request, and make that the upper bound. Then, if there are
				1666	* more requests, keep waiting on the oldest as long as it is still older
				1667	* than the original request.
				1668	*/
				1669	static void sync_write_wait(struct inode *inode)
				1670	{
				1671	struct ceph_inode_info *ci = ceph_inode(inode);
				1672	struct list_head *head = &ci->i_unsafe_writes;
				1673	struct ceph_osd_request *req;
				1674	u64 last_tid;
				1675
				1676	spin_lock(&ci->i_unsafe_lock);
				1677	if (list_empty(head))
				1678	goto out;
				1679
				1680	/* set upper bound as _last_ entry in chain */
				1681	req = list_entry(head->prev, struct ceph_osd_request,
				1682	r_unsafe_item);
				1683	last_tid = req->r_tid;
				1684
				1685	do {
				1686	ceph_osdc_get_request(req);
				1687	spin_unlock(&ci->i_unsafe_lock);
				1688	dout("sync_write_wait on tid %llu (until %llu)\n",
				1689	req->r_tid, last_tid);
				1690	wait_for_completion(&req->r_safe_completion);
				1691	spin_lock(&ci->i_unsafe_lock);
				1692	ceph_osdc_put_request(req);
				1693
				1694	/*
				1695	* from here on look at first entry in chain, since we
				1696	* only want to wait for anything older than last_tid
				1697	*/
				1698	if (list_empty(head))
				1699	break;
				1700	req = list_entry(head->next, struct ceph_osd_request,
				1701	r_unsafe_item);
				1702	} while (req->r_tid < last_tid);
				1703	out:
				1704	spin_unlock(&ci->i_unsafe_lock);
				1705	}
				1706
				1707	int ceph_fsync(struct file file, struct dentry dentry, int datasync)
				1708	{
				1709	struct inode *inode = dentry->d_inode;
				1710	struct ceph_inode_info *ci = ceph_inode(inode);
				1711	unsigned flush_tid;
				1712	int ret;
				1713	int dirty;
				1714
				1715	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
				1716	sync_write_wait(inode);
				1717
				1718	ret = filemap_write_and_wait(inode->i_mapping);
				1719	if (ret < 0)
				1720	return ret;
				1721
				1722	dirty = try_flush_caps(inode, NULL, &flush_tid);
				1723	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
				1724
				1725	/*
				1726	* only wait on non-file metadata writeback (the mds
				1727	* can recover size and mtime, so we don't need to
				1728	* wait for that)
				1729	*/
				1730	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
				1731	dout("fsync waiting for flush_tid %u\n", flush_tid);
				1732	ret = wait_event_interruptible(ci->i_cap_wq,
				1733	caps_are_flushed(inode, flush_tid));
				1734	}
				1735
				1736	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
				1737	return ret;
				1738	}
				1739
				1740	/*
				1741	* Flush any dirty caps back to the mds. If we aren't asked to wait,
				1742	* queue inode for flush but don't do so immediately, because we can
				1743	* get by with fewer MDS messages if we wait for data writeback to
				1744	* complete first.
				1745	*/
				1746	int ceph_write_inode(struct inode *inode, int wait)
				1747	{
				1748	struct ceph_inode_info *ci = ceph_inode(inode);
				1749	unsigned flush_tid;
				1750	int err = 0;
				1751	int dirty;
				1752
				1753	dout("write_inode %p wait=%d\n", inode, wait);
				1754	if (wait) {
				1755	dirty = try_flush_caps(inode, NULL, &flush_tid);
				1756	if (dirty)
				1757	err = wait_event_interruptible(ci->i_cap_wq,
				1758	caps_are_flushed(inode, flush_tid));
				1759	} else {
				1760	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1761
				1762	spin_lock(&inode->i_lock);
				1763	if (__ceph_caps_dirty(ci))
				1764	__cap_delay_requeue_front(mdsc, ci);
				1765	spin_unlock(&inode->i_lock);
				1766	}
				1767	return err;
				1768	}
				1769
				1770	/*
				1771	* After a recovering MDS goes active, we need to resend any caps
				1772	* we were flushing.
				1773	*
				1774	* Caller holds session->s_mutex.
				1775	*/
				1776	static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
				1777	struct ceph_mds_session *session)
				1778	{
				1779	struct ceph_cap_snap *capsnap;
				1780
				1781	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
				1782	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
				1783	flushing_item) {
				1784	struct ceph_inode_info *ci = capsnap->ci;
				1785	struct inode *inode = &ci->vfs_inode;
				1786	struct ceph_cap *cap;
				1787
				1788	spin_lock(&inode->i_lock);
				1789	cap = ci->i_auth_cap;
				1790	if (cap && cap->session == session) {
				1791	dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
				1792	cap, capsnap);
				1793	__ceph_flush_snaps(ci, &session);
				1794	} else {
				1795	pr_err("%p auth cap %p not mds%d ???\n", inode,
				1796	cap, session->s_mds);
				1797	spin_unlock(&inode->i_lock);
				1798	}
				1799	}
				1800	}
				1801
				1802	void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
				1803	struct ceph_mds_session *session)
				1804	{
				1805	struct ceph_inode_info *ci;
				1806
				1807	kick_flushing_capsnaps(mdsc, session);
				1808
				1809	dout("kick_flushing_caps mds%d\n", session->s_mds);
				1810	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
				1811	struct inode *inode = &ci->vfs_inode;
				1812	struct ceph_cap *cap;
				1813	int delayed = 0;
				1814
				1815	spin_lock(&inode->i_lock);
				1816	cap = ci->i_auth_cap;
				1817	if (cap && cap->session == session) {
				1818	dout("kick_flushing_caps %p cap %p %s\n", inode,
				1819	cap, ceph_cap_string(ci->i_flushing_caps));
				1820	delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
				1821	__ceph_caps_used(ci),
				1822	__ceph_caps_wanted(ci),
				1823	cap->issued \| cap->implemented,
				1824	ci->i_flushing_caps, NULL);
				1825	if (delayed) {
				1826	spin_lock(&inode->i_lock);
				1827	__cap_delay_requeue(mdsc, ci);
				1828	spin_unlock(&inode->i_lock);
				1829	}
				1830	} else {
				1831	pr_err("%p auth cap %p not mds%d ???\n", inode,
				1832	cap, session->s_mds);
				1833	spin_unlock(&inode->i_lock);
				1834	}
				1835	}
				1836	}
				1837
				1838
				1839	/*
				1840	* Take references to capabilities we hold, so that we don't release
				1841	* them to the MDS prematurely.
				1842	*
				1843	* Protected by i_lock.
				1844	*/
				1845	static void __take_cap_refs(struct ceph_inode_info *ci, int got)
				1846	{
				1847	if (got & CEPH_CAP_PIN)
				1848	ci->i_pin_ref++;
				1849	if (got & CEPH_CAP_FILE_RD)
				1850	ci->i_rd_ref++;
				1851	if (got & CEPH_CAP_FILE_CACHE)
				1852	ci->i_rdcache_ref++;
				1853	if (got & CEPH_CAP_FILE_WR)
				1854	ci->i_wr_ref++;
				1855	if (got & CEPH_CAP_FILE_BUFFER) {
				1856	if (ci->i_wrbuffer_ref == 0)
				1857	igrab(&ci->vfs_inode);
				1858	ci->i_wrbuffer_ref++;
				1859	dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
				1860	&ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
				1861	}
				1862	}
				1863
				1864	/*
				1865	* Try to grab cap references. Specify those refs we @want, and the
				1866	* minimal set we @need. Also include the larger offset we are writing
				1867	* to (when applicable), and check against max_size here as well.
				1868	* Note that caller is responsible for ensuring max_size increases are
				1869	* requested from the MDS.
				1870	*/
				1871	static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
				1872	int got, loff_t endoff, int check_max, int *err)
				1873	{
				1874	struct inode *inode = &ci->vfs_inode;
				1875	int ret = 0;
				1876	int have, implemented;
				1877
				1878	dout("get_cap_refs %p need %s want %s\n", inode,
				1879	ceph_cap_string(need), ceph_cap_string(want));
				1880	spin_lock(&inode->i_lock);
				1881
				1882	/* make sure we _have_ some caps! */
				1883	if (!__ceph_is_any_caps(ci)) {
				1884	dout("get_cap_refs %p no real caps\n", inode);
				1885	*err = -EBADF;
				1886	ret = 1;
				1887	goto out;
				1888	}
				1889
				1890	if (need & CEPH_CAP_FILE_WR) {
				1891	if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
				1892	dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
				1893	inode, endoff, ci->i_max_size);
				1894	if (endoff > ci->i_wanted_max_size) {
				1895	*check_max = 1;
				1896	ret = 1;
				1897	}
				1898	goto out;
				1899	}
				1900	/*
				1901	* If a sync write is in progress, we must wait, so that we
				1902	* can get a final snapshot value for size+mtime.
				1903	*/
				1904	if (__ceph_have_pending_cap_snap(ci)) {
				1905	dout("get_cap_refs %p cap_snap_pending\n", inode);
				1906	goto out;
				1907	}
				1908	}
				1909	have = __ceph_caps_issued(ci, &implemented);
				1910
				1911	/*
				1912	* disallow writes while a truncate is pending
				1913	*/
				1914	if (ci->i_truncate_pending)
				1915	have &= ~CEPH_CAP_FILE_WR;
				1916
				1917	if ((have & need) == need) {
				1918	/*
				1919	* Look at (implemented & ~have & not) so that we keep waiting
				1920	* on transition from wanted -> needed caps. This is needed
				1921	* for WRBUFFER\|WR -> WR to avoid a new WR sync write from
				1922	* going before a prior buffered writeback happens.
				1923	*/
				1924	int not = want & ~(have & need);
				1925	int revoking = implemented & ~have;
				1926	dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
				1927	inode, ceph_cap_string(have), ceph_cap_string(not),
				1928	ceph_cap_string(revoking));
				1929	if ((revoking & not) == 0) {
				1930	*got = need \| (have & want);
				1931	__take_cap_refs(ci, *got);
				1932	ret = 1;
				1933	}
				1934	} else {
				1935	dout("get_cap_refs %p have %s needed %s\n", inode,
				1936	ceph_cap_string(have), ceph_cap_string(need));
				1937	}
				1938	out:
				1939	spin_unlock(&inode->i_lock);
				1940	dout("get_cap_refs %p ret %d got %s\n", inode,
				1941	ret, ceph_cap_string(*got));
				1942	return ret;
				1943	}
				1944
				1945	/*
				1946	* Check the offset we are writing up to against our current
				1947	* max_size. If necessary, tell the MDS we want to write to
				1948	* a larger offset.
				1949	*/
				1950	static void check_max_size(struct inode *inode, loff_t endoff)
				1951	{
				1952	struct ceph_inode_info *ci = ceph_inode(inode);
				1953	int check = 0;
				1954
				1955	/* do we need to explicitly request a larger max_size? */
				1956	spin_lock(&inode->i_lock);
				1957	if ((endoff >= ci->i_max_size \|\|
				1958	endoff > (inode->i_size << 1)) &&
				1959	endoff > ci->i_wanted_max_size) {
				1960	dout("write %p at large endoff %llu, req max_size\n",
				1961	inode, endoff);
				1962	ci->i_wanted_max_size = endoff;
				1963	check = 1;
				1964	}
				1965	spin_unlock(&inode->i_lock);
				1966	if (check)
				1967	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				1968	}
				1969
				1970	/*
				1971	* Wait for caps, and take cap references. If we can't get a WR cap
				1972	* due to a small max_size, make sure we check_max_size (and possibly
				1973	* ask the mds) so we don't get hung up indefinitely.
				1974	*/
				1975	int ceph_get_caps(struct ceph_inode_info ci, int need, int want, int got,
				1976	loff_t endoff)
				1977	{
				1978	int check_max, ret, err;
				1979
				1980	retry:
				1981	if (endoff > 0)
				1982	check_max_size(&ci->vfs_inode, endoff);
				1983	check_max = 0;
				1984	err = 0;
				1985	ret = wait_event_interruptible(ci->i_cap_wq,
				1986	try_get_cap_refs(ci, need, want,
				1987	got, endoff,
				1988	&check_max, &err));
				1989	if (err)
				1990	ret = err;
				1991	if (check_max)
				1992	goto retry;
				1993	return ret;
				1994	}
				1995
				1996	/*
				1997	* Take cap refs. Caller must already know we hold at least one ref
				1998	* on the caps in question or we don't know this is safe.
				1999	*/
				2000	void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
				2001	{
				2002	spin_lock(&ci->vfs_inode.i_lock);
				2003	__take_cap_refs(ci, caps);
				2004	spin_unlock(&ci->vfs_inode.i_lock);
				2005	}
				2006
				2007	/*
				2008	* Release cap refs.
				2009	*
				2010	* If we released the last ref on any given cap, call ceph_check_caps
				2011	* to release (or schedule a release).
				2012	*
				2013	* If we are releasing a WR cap (from a sync write), finalize any affected
				2014	* cap_snap, and wake up any waiters.
				2015	*/
				2016	void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
				2017	{
				2018	struct inode *inode = &ci->vfs_inode;
				2019	int last = 0, put = 0, flushsnaps = 0, wake = 0;
				2020	struct ceph_cap_snap *capsnap;
				2021
				2022	spin_lock(&inode->i_lock);
				2023	if (had & CEPH_CAP_PIN)
				2024	--ci->i_pin_ref;
				2025	if (had & CEPH_CAP_FILE_RD)
				2026	if (--ci->i_rd_ref == 0)
				2027	last++;
				2028	if (had & CEPH_CAP_FILE_CACHE)
				2029	if (--ci->i_rdcache_ref == 0)
				2030	last++;
				2031	if (had & CEPH_CAP_FILE_BUFFER) {
				2032	if (--ci->i_wrbuffer_ref == 0) {
				2033	last++;
				2034	put++;
				2035	}
				2036	dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
				2037	inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
				2038	}
				2039	if (had & CEPH_CAP_FILE_WR)
				2040	if (--ci->i_wr_ref == 0) {
				2041	last++;
				2042	if (!list_empty(&ci->i_cap_snaps)) {
				2043	capsnap = list_first_entry(&ci->i_cap_snaps,
				2044	struct ceph_cap_snap,
				2045	ci_item);
				2046	if (capsnap->writing) {
				2047	capsnap->writing = 0;
				2048	flushsnaps =
				2049	__ceph_finish_cap_snap(ci,
				2050	capsnap);
				2051	wake = 1;
				2052	}
				2053	}
				2054	}
				2055	spin_unlock(&inode->i_lock);
				2056
				2057	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
				2058	last ? "last" : "");
				2059
				2060	if (last && !flushsnaps)
				2061	ceph_check_caps(ci, 0, NULL);
				2062	else if (flushsnaps)
				2063	ceph_flush_snaps(ci);
				2064	if (wake)
				2065	wake_up(&ci->i_cap_wq);
				2066	if (put)
				2067	iput(inode);
				2068	}
				2069
				2070	/*
				2071	* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
				2072	* context. Adjust per-snap dirty page accounting as appropriate.
				2073	* Once all dirty data for a cap_snap is flushed, flush snapped file
				2074	* metadata back to the MDS. If we dropped the last ref, call
				2075	* ceph_check_caps.
				2076	*/
				2077	void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				2078	struct ceph_snap_context *snapc)
				2079	{
				2080	struct inode *inode = &ci->vfs_inode;
				2081	int last = 0;
				2082	int last_snap = 0;
				2083	int found = 0;
				2084	struct ceph_cap_snap *capsnap = NULL;
				2085
				2086	spin_lock(&inode->i_lock);
				2087	ci->i_wrbuffer_ref -= nr;
				2088	last = !ci->i_wrbuffer_ref;
				2089
				2090	if (ci->i_head_snapc == snapc) {
				2091	ci->i_wrbuffer_ref_head -= nr;
				2092	if (!ci->i_wrbuffer_ref_head) {
				2093	ceph_put_snap_context(ci->i_head_snapc);
				2094	ci->i_head_snapc = NULL;
				2095	}
				2096	dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
				2097	inode,
				2098	ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
				2099	ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
				2100	last ? " LAST" : "");
				2101	} else {
				2102	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				2103	if (capsnap->context == snapc) {
				2104	found = 1;
				2105	capsnap->dirty_pages -= nr;
				2106	last_snap = !capsnap->dirty_pages;
				2107	break;
				2108	}
				2109	}
				2110	BUG_ON(!found);
				2111	dout("put_wrbuffer_cap_refs on %p cap_snap %p "
				2112	" snap %lld %d/%d -> %d/%d %s%s\n",
				2113	inode, capsnap, capsnap->context->seq,
				2114	ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
				2115	ci->i_wrbuffer_ref, capsnap->dirty_pages,
				2116	last ? " (wrbuffer last)" : "",
				2117	last_snap ? " (capsnap last)" : "");
				2118	}
				2119
				2120	spin_unlock(&inode->i_lock);
				2121
				2122	if (last) {
				2123	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				2124	iput(inode);
				2125	} else if (last_snap) {
				2126	ceph_flush_snaps(ci);
				2127	wake_up(&ci->i_cap_wq);
				2128	}
				2129	}
				2130
				2131	/*
				2132	* Handle a cap GRANT message from the MDS. (Note that a GRANT may
				2133	* actually be a revocation if it specifies a smaller cap set.)
				2134	*
				2135	* caller holds s_mutex.
				2136	* return value:
				2137	* 0 - ok
				2138	* 1 - check_caps on auth cap only (writeback)
				2139	* 2 - check_caps (ack revoke)
				2140	*/
				2141	static int handle_cap_grant(struct inode inode, struct ceph_mds_caps grant,
				2142	struct ceph_mds_session *session,
				2143	struct ceph_cap *cap,
				2144	struct ceph_buffer *xattr_buf)
				2145	__releases(inode->i_lock)
				2146
				2147	{
				2148	struct ceph_inode_info *ci = ceph_inode(inode);
				2149	int mds = session->s_mds;
				2150	int seq = le32_to_cpu(grant->seq);
				2151	int newcaps = le32_to_cpu(grant->caps);
				2152	int issued, implemented, used, wanted, dirty;
				2153	u64 size = le64_to_cpu(grant->size);
				2154	u64 max_size = le64_to_cpu(grant->max_size);
				2155	struct timespec mtime, atime, ctime;
				2156	int reply = 0;
				2157	int wake = 0;
				2158	int writeback = 0;
				2159	int revoked_rdcache = 0;
				2160	int invalidate_async = 0;
				2161	int tried_invalidate = 0;
				2162	int ret;
				2163
				2164	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
				2165	inode, cap, mds, seq, ceph_cap_string(newcaps));
				2166	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
				2167	inode->i_size);
				2168
				2169	/*
				2170	* If CACHE is being revoked, and we have no dirty buffers,
				2171	* try to invalidate (once). (If there are dirty buffers, we
				2172	* will invalidate _after_ writeback.)
				2173	*/
				2174	restart:
				2175	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
				2176	!ci->i_wrbuffer_ref && !tried_invalidate) {
				2177	dout("CACHE invalidation\n");
				2178	spin_unlock(&inode->i_lock);
				2179	tried_invalidate = 1;
				2180
				2181	ret = invalidate_inode_pages2(&inode->i_data);
				2182	spin_lock(&inode->i_lock);
				2183	if (ret < 0) {
				2184	/* there were locked pages.. invalidate later
				2185	in a separate thread. */
				2186	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				2187	invalidate_async = 1;
				2188	ci->i_rdcache_revoking = ci->i_rdcache_gen;
				2189	}
				2190	} else {
				2191	/* we successfully invalidated those pages */
				2192	revoked_rdcache = 1;
				2193	ci->i_rdcache_gen = 0;
				2194	ci->i_rdcache_revoking = 0;
				2195	}
				2196	goto restart;
				2197	}
				2198
				2199	/* side effects now are allowed */
				2200
				2201	issued = __ceph_caps_issued(ci, &implemented);
				2202	issued \|= implemented \| __ceph_caps_dirty(ci);
				2203
				2204	cap->gen = session->s_cap_gen;
				2205
				2206	__check_cap_issue(ci, cap, newcaps);
				2207
				2208	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
				2209	inode->i_mode = le32_to_cpu(grant->mode);
				2210	inode->i_uid = le32_to_cpu(grant->uid);
				2211	inode->i_gid = le32_to_cpu(grant->gid);
				2212	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
				2213	inode->i_uid, inode->i_gid);
				2214	}
				2215
				2216	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
				2217	inode->i_nlink = le32_to_cpu(grant->nlink);
				2218
				2219	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
				2220	int len = le32_to_cpu(grant->xattr_len);
				2221	u64 version = le64_to_cpu(grant->xattr_version);
				2222
				2223	if (version > ci->i_xattrs.version) {
				2224	dout(" got new xattrs v%llu on %p len %d\n",
				2225	version, inode, len);
				2226	if (ci->i_xattrs.blob)
				2227	ceph_buffer_put(ci->i_xattrs.blob);
				2228	ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
				2229	ci->i_xattrs.version = version;
				2230	}
				2231	}
				2232
				2233	/* size/ctime/mtime/atime? */
				2234	ceph_fill_file_size(inode, issued,
				2235	le32_to_cpu(grant->truncate_seq),
				2236	le64_to_cpu(grant->truncate_size), size);
				2237	ceph_decode_timespec(&mtime, &grant->mtime);
				2238	ceph_decode_timespec(&atime, &grant->atime);
				2239	ceph_decode_timespec(&ctime, &grant->ctime);
				2240	ceph_fill_file_time(inode, issued,
				2241	le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
				2242	&atime);
				2243
				2244	/* max size increase? */
				2245	if (max_size != ci->i_max_size) {
				2246	dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
				2247	ci->i_max_size = max_size;
				2248	if (max_size >= ci->i_wanted_max_size) {
				2249	ci->i_wanted_max_size = 0; /* reset */
				2250	ci->i_requested_max_size = 0;
				2251	}
				2252	wake = 1;
				2253	}
				2254
				2255	/* check cap bits */
				2256	wanted = __ceph_caps_wanted(ci);
				2257	used = __ceph_caps_used(ci);
				2258	dirty = __ceph_caps_dirty(ci);
				2259	dout(" my wanted = %s, used = %s, dirty %s\n",
				2260	ceph_cap_string(wanted),
				2261	ceph_cap_string(used),
				2262	ceph_cap_string(dirty));
				2263	if (wanted != le32_to_cpu(grant->wanted)) {
				2264	dout("mds wanted %s -> %s\n",
				2265	ceph_cap_string(le32_to_cpu(grant->wanted)),
				2266	ceph_cap_string(wanted));
				2267	grant->wanted = cpu_to_le32(wanted);
				2268	}
				2269
				2270	cap->seq = seq;
				2271
				2272	/* file layout may have changed */
				2273	ci->i_layout = grant->layout;
				2274
				2275	/* revocation, grant, or no-op? */
				2276	if (cap->issued & ~newcaps) {
				2277	dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
				2278	ceph_cap_string(newcaps));
				2279	if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
				2280	writeback = 1; /* will delay ack */
				2281	else if (dirty & ~newcaps)
				2282	reply = 1; /* initiate writeback in check_caps */
				2283	else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 \|\|
				2284	revoked_rdcache)
				2285	reply = 2; /* send revoke ack in check_caps */
				2286	cap->issued = newcaps;
				2287	} else if (cap->issued == newcaps) {
				2288	dout("caps unchanged: %s -> %s\n",
				2289	ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
				2290	} else {
				2291	dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
				2292	ceph_cap_string(newcaps));
				2293	cap->issued = newcaps;
				2294	cap->implemented \|= newcaps; /* add bits only, to
				2295	* avoid stepping on a
				2296	* pending revocation */
				2297	wake = 1;
				2298	}
				2299
				2300	spin_unlock(&inode->i_lock);
				2301	if (writeback) {
				2302	/*
				2303	* queue inode for writeback: we can't actually call
				2304	* filemap_write_and_wait, etc. from message handler
				2305	* context.
				2306	*/
				2307	dout("queueing %p for writeback\n", inode);
				2308	if (ceph_queue_writeback(inode))
				2309	igrab(inode);
				2310	}
				2311	if (invalidate_async) {
				2312	dout("queueing %p for page invalidation\n", inode);
				2313	if (ceph_queue_page_invalidation(inode))
				2314	igrab(inode);
				2315	}
				2316	if (wake)
				2317	wake_up(&ci->i_cap_wq);
				2318	return reply;
				2319	}
				2320
				2321	/*
				2322	* Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
				2323	* MDS has been safely committed.
				2324	*/
				2325	static void handle_cap_flush_ack(struct inode *inode,
				2326	struct ceph_mds_caps *m,
				2327	struct ceph_mds_session *session,
				2328	struct ceph_cap *cap)
				2329	__releases(inode->i_lock)
				2330	{
				2331	struct ceph_inode_info *ci = ceph_inode(inode);
				2332	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				2333	unsigned seq = le32_to_cpu(m->seq);
				2334	int dirty = le32_to_cpu(m->dirty);
				2335	int cleaned = 0;
				2336	u64 flush_tid = le64_to_cpu(m->client_tid);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	2337	int drop = 0;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2338	int i;
				2339
				2340	for (i = 0; i < CEPH_CAP_BITS; i++)
				2341	if ((dirty & (1 << i)) &&
				2342	flush_tid == ci->i_cap_flush_tid[i])
				2343	cleaned \|= 1 << i;
				2344
				2345	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
				2346	" flushing %s -> %s\n",
				2347	inode, session->s_mds, seq, ceph_cap_string(dirty),
				2348	ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
				2349	ceph_cap_string(ci->i_flushing_caps & ~cleaned));
				2350
				2351	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
				2352	goto out;
				2353
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2354	ci->i_flushing_caps &= ~cleaned;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2355
				2356	spin_lock(&mdsc->cap_dirty_lock);
				2357	if (ci->i_flushing_caps == 0) {
				2358	list_del_init(&ci->i_flushing_item);
				2359	if (!list_empty(&session->s_cap_flushing))
				2360	dout(" mds%d still flushing cap on %p\n",
				2361	session->s_mds,
				2362	&list_entry(session->s_cap_flushing.next,
				2363	struct ceph_inode_info,
				2364	i_flushing_item)->vfs_inode);
				2365	mdsc->num_cap_flushing--;
				2366	wake_up(&mdsc->cap_flushing_wq);
				2367	dout(" inode %p now !flushing\n", inode);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	2368
				2369	if (ci->i_dirty_caps == 0) {
				2370	dout(" inode %p now clean\n", inode);
				2371	BUG_ON(!list_empty(&ci->i_dirty_item));
				2372	drop = 1;
				2373	}
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2374	}
				2375	spin_unlock(&mdsc->cap_dirty_lock);
				2376	wake_up(&ci->i_cap_wq);
				2377
				2378	out:
				2379	spin_unlock(&inode->i_lock);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	2380	if (drop)
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2381	iput(inode);
				2382	}
				2383
				2384	/*
				2385	* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
				2386	* throw away our cap_snap.
				2387	*
				2388	* Caller hold s_mutex.
				2389	*/
				2390	static void handle_cap_flushsnap_ack(struct inode *inode,
				2391	struct ceph_mds_caps *m,
				2392	struct ceph_mds_session *session)
				2393	{
				2394	struct ceph_inode_info *ci = ceph_inode(inode);
				2395	u64 follows = le64_to_cpu(m->snap_follows);
				2396	u64 flush_tid = le64_to_cpu(m->client_tid);
				2397	struct ceph_cap_snap *capsnap;
				2398	int drop = 0;
				2399
				2400	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
				2401	inode, ci, session->s_mds, follows);
				2402
				2403	spin_lock(&inode->i_lock);
				2404	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				2405	if (capsnap->follows == follows) {
				2406	if (capsnap->flush_tid != flush_tid) {
				2407	dout(" cap_snap %p follows %lld tid %lld !="
				2408	" %lld\n", capsnap, follows,
				2409	flush_tid, capsnap->flush_tid);
				2410	break;
				2411	}
				2412	WARN_ON(capsnap->dirty_pages \|\| capsnap->writing);
				2413	dout(" removing cap_snap %p follows %lld\n",
				2414	capsnap, follows);
				2415	ceph_put_snap_context(capsnap->context);
				2416	list_del(&capsnap->ci_item);
				2417	list_del(&capsnap->flushing_item);
				2418	ceph_put_cap_snap(capsnap);
				2419	drop = 1;
				2420	break;
				2421	} else {
				2422	dout(" skipping cap_snap %p follows %lld\n",
				2423	capsnap, capsnap->follows);
				2424	}
				2425	}
				2426	spin_unlock(&inode->i_lock);
				2427	if (drop)
				2428	iput(inode);
				2429	}
				2430
				2431	/*
				2432	* Handle TRUNC from MDS, indicating file truncation.
				2433	*
				2434	* caller hold s_mutex.
				2435	*/
				2436	static void handle_cap_trunc(struct inode *inode,
				2437	struct ceph_mds_caps *trunc,
				2438	struct ceph_mds_session *session)
				2439	__releases(inode->i_lock)
				2440	{
				2441	struct ceph_inode_info *ci = ceph_inode(inode);
				2442	int mds = session->s_mds;
				2443	int seq = le32_to_cpu(trunc->seq);
				2444	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
				2445	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
				2446	u64 size = le64_to_cpu(trunc->size);
				2447	int implemented = 0;
				2448	int dirty = __ceph_caps_dirty(ci);
				2449	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
				2450	int queue_trunc = 0;
				2451
				2452	issued \|= implemented \| dirty;
				2453
				2454	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
				2455	inode, mds, seq, truncate_size, truncate_seq);
				2456	queue_trunc = ceph_fill_file_size(inode, issued,
				2457	truncate_seq, truncate_size, size);
				2458	spin_unlock(&inode->i_lock);
				2459
				2460	if (queue_trunc)
				2461	if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
				2462	&ci->i_vmtruncate_work))
				2463	igrab(inode);
				2464	}
				2465
				2466	/*
				2467	* Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
				2468	* different one. If we are the most recent migration we've seen (as
				2469	* indicated by mseq), make note of the migrating cap bits for the
				2470	* duration (until we see the corresponding IMPORT).
				2471	*
				2472	* caller holds s_mutex
				2473	*/
				2474	static void handle_cap_export(struct inode inode, struct ceph_mds_caps ex,
				2475	struct ceph_mds_session *session)
				2476	{
				2477	struct ceph_inode_info *ci = ceph_inode(inode);
				2478	int mds = session->s_mds;
				2479	unsigned mseq = le32_to_cpu(ex->migrate_seq);
				2480	struct ceph_cap cap = NULL, t;
				2481	struct rb_node *p;
				2482	int remember = 1;
				2483
				2484	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
				2485	inode, ci, mds, mseq);
				2486
				2487	spin_lock(&inode->i_lock);
				2488
				2489	/* make sure we haven't seen a higher mseq */
				2490	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				2491	t = rb_entry(p, struct ceph_cap, ci_node);
				2492	if (ceph_seq_cmp(t->mseq, mseq) > 0) {
				2493	dout(" higher mseq on cap from mds%d\n",
				2494	t->session->s_mds);
				2495	remember = 0;
				2496	}
				2497	if (t->session->s_mds == mds)
				2498	cap = t;
				2499	}
				2500
				2501	if (cap) {
				2502	if (remember) {
				2503	/* make note */
				2504	ci->i_cap_exporting_mds = mds;
				2505	ci->i_cap_exporting_mseq = mseq;
				2506	ci->i_cap_exporting_issued = cap->issued;
				2507	}
				2508	__ceph_remove_cap(cap, NULL);
				2509	} else {
				2510	WARN_ON(!cap);
				2511	}
				2512
				2513	spin_unlock(&inode->i_lock);
				2514	}
				2515
				2516	/*
				2517	* Handle cap IMPORT. If there are temp bits from an older EXPORT,
				2518	* clean them up.
				2519	*
				2520	* caller holds s_mutex.
				2521	*/
				2522	static void handle_cap_import(struct ceph_mds_client *mdsc,
				2523	struct inode inode, struct ceph_mds_caps im,
				2524	struct ceph_mds_session *session,
				2525	void *snaptrace, int snaptrace_len)
				2526	{
				2527	struct ceph_inode_info *ci = ceph_inode(inode);
				2528	int mds = session->s_mds;
				2529	unsigned issued = le32_to_cpu(im->caps);
				2530	unsigned wanted = le32_to_cpu(im->wanted);
				2531	unsigned seq = le32_to_cpu(im->seq);
				2532	unsigned mseq = le32_to_cpu(im->migrate_seq);
				2533	u64 realmino = le64_to_cpu(im->realm);
				2534	u64 cap_id = le64_to_cpu(im->cap_id);
				2535
				2536	if (ci->i_cap_exporting_mds >= 0 &&
				2537	ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
				2538	dout("handle_cap_import inode %p ci %p mds%d mseq %d"
				2539	" - cleared exporting from mds%d\n",
				2540	inode, ci, mds, mseq,
				2541	ci->i_cap_exporting_mds);
				2542	ci->i_cap_exporting_issued = 0;
				2543	ci->i_cap_exporting_mseq = 0;
				2544	ci->i_cap_exporting_mds = -1;
				2545	} else {
				2546	dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
				2547	inode, ci, mds, mseq);
				2548	}
				2549
				2550	down_write(&mdsc->snap_rwsem);
				2551	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
				2552	false);
				2553	downgrade_write(&mdsc->snap_rwsem);
				2554	ceph_add_cap(inode, session, cap_id, -1,
				2555	issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
				2556	NULL /* no caps context */);
				2557	try_flush_caps(inode, session, NULL);
				2558	up_read(&mdsc->snap_rwsem);
				2559	}
				2560
				2561	/*
				2562	* Handle a caps message from the MDS.
				2563	*
				2564	* Identify the appropriate session, inode, and call the right handler
				2565	* based on the cap op.
				2566	*/
				2567	void ceph_handle_caps(struct ceph_mds_session *session,
				2568	struct ceph_msg *msg)
				2569	{
				2570	struct ceph_mds_client *mdsc = session->s_mdsc;
				2571	struct super_block *sb = mdsc->client->sb;
				2572	struct inode *inode;
				2573	struct ceph_cap *cap;
				2574	struct ceph_mds_caps *h;
				2575	int mds = le64_to_cpu(msg->hdr.src.name.num);
				2576	int op;
				2577	u32 seq;
				2578	struct ceph_vino vino;
				2579	u64 cap_id;
				2580	u64 size, max_size;
				2581	int check_caps = 0;
				2582	int r;
				2583
				2584	dout("handle_caps from mds%d\n", mds);
				2585
				2586	/* decode */
				2587	if (msg->front.iov_len < sizeof(*h))
				2588	goto bad;
				2589	h = msg->front.iov_base;
				2590	op = le32_to_cpu(h->op);
				2591	vino.ino = le64_to_cpu(h->ino);
				2592	vino.snap = CEPH_NOSNAP;
				2593	cap_id = le64_to_cpu(h->cap_id);
				2594	seq = le32_to_cpu(h->seq);
				2595	size = le64_to_cpu(h->size);
				2596	max_size = le64_to_cpu(h->max_size);
				2597
				2598	mutex_lock(&session->s_mutex);
				2599	session->s_seq++;
				2600	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
				2601	(unsigned)seq);
				2602
				2603	/* lookup ino */
				2604	inode = ceph_find_inode(sb, vino);
				2605	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
				2606	vino.snap, inode);
				2607	if (!inode) {
				2608	dout(" i don't have ino %llx\n", vino.ino);
				2609	goto done;
				2610	}
				2611
				2612	/* these will work even if we don't have a cap yet */
				2613	switch (op) {
				2614	case CEPH_CAP_OP_FLUSHSNAP_ACK:
				2615	handle_cap_flushsnap_ack(inode, h, session);
				2616	goto done;
				2617
				2618	case CEPH_CAP_OP_EXPORT:
				2619	handle_cap_export(inode, h, session);
				2620	goto done;
				2621
				2622	case CEPH_CAP_OP_IMPORT:
				2623	handle_cap_import(mdsc, inode, h, session,
				2624	msg->middle,
				2625	le32_to_cpu(h->snap_trace_len));
				2626	check_caps = 1; /* we may have sent a RELEASE to the old auth */
				2627	goto done;
				2628	}
				2629
				2630	/* the rest require a cap */
				2631	spin_lock(&inode->i_lock);
				2632	cap = __get_cap_for_mds(ceph_inode(inode), mds);
				2633	if (!cap) {
				2634	dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
				2635	inode, ceph_ino(inode), ceph_snap(inode), mds);
				2636	spin_unlock(&inode->i_lock);
				2637	goto done;
				2638	}
				2639
				2640	/* note that each of these drops i_lock for us */
				2641	switch (op) {
				2642	case CEPH_CAP_OP_REVOKE:
				2643	case CEPH_CAP_OP_GRANT:
				2644	r = handle_cap_grant(inode, h, session, cap, msg->middle);
				2645	if (r == 1)
				2646	ceph_check_caps(ceph_inode(inode),
				2647	CHECK_CAPS_NODELAY\|CHECK_CAPS_AUTHONLY,
				2648	session);
				2649	else if (r == 2)
				2650	ceph_check_caps(ceph_inode(inode),
				2651	CHECK_CAPS_NODELAY,
				2652	session);
				2653	break;
				2654
				2655	case CEPH_CAP_OP_FLUSH_ACK:
				2656	handle_cap_flush_ack(inode, h, session, cap);
				2657	break;
				2658
				2659	case CEPH_CAP_OP_TRUNC:
				2660	handle_cap_trunc(inode, h, session);
				2661	break;
				2662
				2663	default:
				2664	spin_unlock(&inode->i_lock);
				2665	pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
				2666	ceph_cap_op_name(op));
				2667	}
				2668
				2669	done:
				2670	mutex_unlock(&session->s_mutex);
				2671
				2672	if (check_caps)
				2673	ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
				2674	if (inode)
				2675	iput(inode);
				2676	return;
				2677
				2678	bad:
				2679	pr_err("ceph_handle_caps: corrupt message\n");
				2680	return;
				2681	}
				2682
				2683	/*
				2684	* Delayed work handler to process end of delayed cap release LRU list.
				2685	*/
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	2686	void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2687	{
				2688	struct ceph_inode_info *ci;
				2689	int flags = CHECK_CAPS_NODELAY;
				2690
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2691	dout("check_delayed_caps\n");
				2692	while (1) {
				2693	spin_lock(&mdsc->cap_delay_lock);
				2694	if (list_empty(&mdsc->cap_delay_list))
				2695	break;
				2696	ci = list_first_entry(&mdsc->cap_delay_list,
				2697	struct ceph_inode_info,
				2698	i_cap_delay_list);
				2699	if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
				2700	time_before(jiffies, ci->i_hold_caps_max))
				2701	break;
				2702	list_del_init(&ci->i_cap_delay_list);
				2703	spin_unlock(&mdsc->cap_delay_lock);
				2704	dout("check_delayed_caps on %p\n", &ci->vfs_inode);
				2705	ceph_check_caps(ci, flags, NULL);
				2706	}
				2707	spin_unlock(&mdsc->cap_delay_lock);
				2708	}
				2709
				2710	/*
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame^]	2711	* Flush all dirty caps to the mds
				2712	*/
				2713	void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
				2714	{
				2715	struct ceph_inode_info *ci;
				2716	struct inode *inode;
				2717
				2718	dout("flush_dirty_caps\n");
				2719	spin_lock(&mdsc->cap_dirty_lock);
				2720	while (!list_empty(&mdsc->cap_dirty)) {
				2721	ci = list_first_entry(&mdsc->cap_dirty,
				2722	struct ceph_inode_info,
				2723	i_dirty_item);
				2724	inode = igrab(&ci->vfs_inode);
				2725	spin_unlock(&mdsc->cap_dirty_lock);
				2726	if (inode) {
				2727	ceph_check_caps(ci, CHECK_CAPS_NODELAY\|CHECK_CAPS_FLUSH,
				2728	NULL);
				2729	iput(inode);
				2730	}
				2731	spin_lock(&mdsc->cap_dirty_lock);
				2732	}
				2733	spin_unlock(&mdsc->cap_dirty_lock);
				2734	}
				2735
				2736	/*
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2737	* Drop open file reference. If we were the last open file,
				2738	* we may need to release capabilities to the MDS (or schedule
				2739	* their delayed release).
				2740	*/
				2741	void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
				2742	{
				2743	struct inode *inode = &ci->vfs_inode;
				2744	int last = 0;
				2745
				2746	spin_lock(&inode->i_lock);
				2747	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
				2748	ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
				2749	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
				2750	if (--ci->i_nr_by_mode[fmode] == 0)
				2751	last++;
				2752	spin_unlock(&inode->i_lock);
				2753
				2754	if (last && ci->i_vino.snap == CEPH_NOSNAP)
				2755	ceph_check_caps(ci, 0, NULL);
				2756	}
				2757
				2758	/*
				2759	* Helpers for embedding cap and dentry lease releases into mds
				2760	* requests.
				2761	*
				2762	* @force is used by dentry_release (below) to force inclusion of a
				2763	* record for the directory inode, even when there aren't any caps to
				2764	* drop.
				2765	*/
				2766	int ceph_encode_inode_release(void *p, struct inode inode,
				2767	int mds, int drop, int unless, int force)
				2768	{
				2769	struct ceph_inode_info *ci = ceph_inode(inode);
				2770	struct ceph_cap *cap;
				2771	struct ceph_mds_request_release rel = p;
				2772	int ret = 0;
				2773
				2774	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
				2775	mds, ceph_cap_string(drop), ceph_cap_string(unless));
				2776
				2777	spin_lock(&inode->i_lock);
				2778	cap = __get_cap_for_mds(ci, mds);
				2779	if (cap && __cap_is_valid(cap)) {
				2780	if (force \|\|
				2781	((cap->issued & drop) &&
				2782	(cap->issued & unless) == 0)) {
				2783	if ((cap->issued & drop) &&
				2784	(cap->issued & unless) == 0) {
				2785	dout("encode_inode_release %p cap %p %s -> "
				2786	"%s\n", inode, cap,
				2787	ceph_cap_string(cap->issued),
				2788	ceph_cap_string(cap->issued & ~drop));
				2789	cap->issued &= ~drop;
				2790	cap->implemented &= ~drop;
				2791	if (ci->i_ceph_flags & CEPH_I_NODELAY) {
				2792	int wanted = __ceph_caps_wanted(ci);
				2793	dout(" wanted %s -> %s (act %s)\n",
				2794	ceph_cap_string(cap->mds_wanted),
				2795	ceph_cap_string(cap->mds_wanted &
				2796	~wanted),
				2797	ceph_cap_string(wanted));
				2798	cap->mds_wanted &= wanted;
				2799	}
				2800	} else {
				2801	dout("encode_inode_release %p cap %p %s"
				2802	" (force)\n", inode, cap,
				2803	ceph_cap_string(cap->issued));
				2804	}
				2805
				2806	rel->ino = cpu_to_le64(ceph_ino(inode));
				2807	rel->cap_id = cpu_to_le64(cap->cap_id);
				2808	rel->seq = cpu_to_le32(cap->seq);
				2809	rel->issue_seq = cpu_to_le32(cap->issue_seq),
				2810	rel->mseq = cpu_to_le32(cap->mseq);
				2811	rel->caps = cpu_to_le32(cap->issued);
				2812	rel->wanted = cpu_to_le32(cap->mds_wanted);
				2813	rel->dname_len = 0;
				2814	rel->dname_seq = 0;
				2815	p += sizeof(rel);
				2816	ret = 1;
				2817	} else {
				2818	dout("encode_inode_release %p cap %p %s\n",
				2819	inode, cap, ceph_cap_string(cap->issued));
				2820	}
				2821	}
				2822	spin_unlock(&inode->i_lock);
				2823	return ret;
				2824	}
				2825
				2826	int ceph_encode_dentry_release(void *p, struct dentry dentry,
				2827	int mds, int drop, int unless)
				2828	{
				2829	struct inode *dir = dentry->d_parent->d_inode;
				2830	struct ceph_mds_request_release rel = p;
				2831	struct ceph_dentry_info *di = ceph_dentry(dentry);
				2832	int force = 0;
				2833	int ret;
				2834
				2835	/*
				2836	* force an record for the directory caps if we have a dentry lease.
				2837	* this is racy (can't take i_lock and d_lock together), but it
				2838	* doesn't have to be perfect; the mds will revoke anything we don't
				2839	* release.
				2840	*/
				2841	spin_lock(&dentry->d_lock);
				2842	if (di->lease_session && di->lease_session->s_mds == mds)
				2843	force = 1;
				2844	spin_unlock(&dentry->d_lock);
				2845
				2846	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
				2847
				2848	spin_lock(&dentry->d_lock);
				2849	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
				2850	dout("encode_dentry_release %p mds%d seq %d\n",
				2851	dentry, mds, (int)di->lease_seq);
				2852	rel->dname_len = cpu_to_le32(dentry->d_name.len);
				2853	memcpy(*p, dentry->d_name.name, dentry->d_name.len);
				2854	*p += dentry->d_name.len;
				2855	rel->dname_seq = cpu_to_le32(di->lease_seq);
				2856	}
				2857	spin_unlock(&dentry->d_lock);
				2858	return ret;
				2859	}