Blame - fs/ceph/caps.c - kernel/msm-5.4

blob: 775e6f6fc97079a5c9cb99e07022cfa92c4d368e [file] [log] [blame]

Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1	#include "ceph_debug.h"
				2
				3	#include <linux/fs.h>
				4	#include <linux/kernel.h>
				5	#include <linux/sched.h>
				6	#include <linux/vmalloc.h>
				7	#include <linux/wait.h>
				8
				9	#include "super.h"
				10	#include "decode.h"
				11	#include "messenger.h"
				12
				13	/*
				14	* Capability management
				15	*
				16	* The Ceph metadata servers control client access to inode metadata
				17	* and file data by issuing capabilities, granting clients permission
				18	* to read and/or write both inode field and file data to OSDs
				19	* (storage nodes). Each capability consists of a set of bits
				20	* indicating which operations are allowed.
				21	*
				22	* If the client holds a *_SHARED cap, the client has a coherent value
				23	* that can be safely read from the cached inode.
				24	*
				25	* In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
				26	* client is allowed to change inode attributes (e.g., file size,
				27	* mtime), note its dirty state in the ceph_cap, and asynchronously
				28	* flush that metadata change to the MDS.
				29	*
				30	* In the event of a conflicting operation (perhaps by another
				31	* client), the MDS will revoke the conflicting client capabilities.
				32	*
				33	* In order for a client to cache an inode, it must hold a capability
				34	* with at least one MDS server. When inodes are released, release
				35	* notifications are batched and periodically sent en masse to the MDS
				36	* cluster to release server state.
				37	*/
				38
				39
				40	/*
				41	* Generate readable cap strings for debugging output.
				42	*/
				43	#define MAX_CAP_STR 20
				44	static char cap_str[MAX_CAP_STR][40];
				45	static DEFINE_SPINLOCK(cap_str_lock);
				46	static int last_cap_str;
				47
				48	static char gcap_string(char s, int c)
				49	{
				50	if (c & CEPH_CAP_GSHARED)
				51	*s++ = 's';
				52	if (c & CEPH_CAP_GEXCL)
				53	*s++ = 'x';
				54	if (c & CEPH_CAP_GCACHE)
				55	*s++ = 'c';
				56	if (c & CEPH_CAP_GRD)
				57	*s++ = 'r';
				58	if (c & CEPH_CAP_GWR)
				59	*s++ = 'w';
				60	if (c & CEPH_CAP_GBUFFER)
				61	*s++ = 'b';
				62	if (c & CEPH_CAP_GLAZYIO)
				63	*s++ = 'l';
				64	return s;
				65	}
				66
				67	const char *ceph_cap_string(int caps)
				68	{
				69	int i;
				70	char *s;
				71	int c;
				72
				73	spin_lock(&cap_str_lock);
				74	i = last_cap_str++;
				75	if (last_cap_str == MAX_CAP_STR)
				76	last_cap_str = 0;
				77	spin_unlock(&cap_str_lock);
				78
				79	s = cap_str[i];
				80
				81	if (caps & CEPH_CAP_PIN)
				82	*s++ = 'p';
				83
				84	c = (caps >> CEPH_CAP_SAUTH) & 3;
				85	if (c) {
				86	*s++ = 'A';
				87	s = gcap_string(s, c);
				88	}
				89
				90	c = (caps >> CEPH_CAP_SLINK) & 3;
				91	if (c) {
				92	*s++ = 'L';
				93	s = gcap_string(s, c);
				94	}
				95
				96	c = (caps >> CEPH_CAP_SXATTR) & 3;
				97	if (c) {
				98	*s++ = 'X';
				99	s = gcap_string(s, c);
				100	}
				101
				102	c = caps >> CEPH_CAP_SFILE;
				103	if (c) {
				104	*s++ = 'F';
				105	s = gcap_string(s, c);
				106	}
				107
				108	if (s == cap_str[i])
				109	*s++ = '-';
				110	*s = 0;
				111	return cap_str[i];
				112	}
				113
				114	/*
				115	* Cap reservations
				116	*
				117	* Maintain a global pool of preallocated struct ceph_caps, referenced
				118	* by struct ceph_cap_reservation. This ensures that we preallocate
				119	* memory needed to successfully process an MDS response. (If an MDS
				120	* sends us cap information and we fail to process it, we will have
				121	* problems due to the client and MDS being out of sync.)
				122	*
				123	* Reservations are 'owned' by a ceph_cap_reservation context.
				124	*/
				125	static spinlock_t caps_list_lock;
				126	static struct list_head caps_list; /* unused (reserved or unreserved) */
				127	static int caps_total_count; /* total caps allocated */
				128	static int caps_use_count; /* in use */
				129	static int caps_reserve_count; /* unused, reserved */
				130	static int caps_avail_count; /* unused, unreserved */
				131
				132	void __init ceph_caps_init(void)
				133	{
				134	INIT_LIST_HEAD(&caps_list);
				135	spin_lock_init(&caps_list_lock);
				136	}
				137
				138	void ceph_caps_finalize(void)
				139	{
				140	struct ceph_cap *cap;
				141
				142	spin_lock(&caps_list_lock);
				143	while (!list_empty(&caps_list)) {
				144	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
				145	list_del(&cap->caps_item);
				146	kmem_cache_free(ceph_cap_cachep, cap);
				147	}
				148	caps_total_count = 0;
				149	caps_avail_count = 0;
				150	caps_use_count = 0;
				151	caps_reserve_count = 0;
				152	spin_unlock(&caps_list_lock);
				153	}
				154
				155	int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
				156	{
				157	int i;
				158	struct ceph_cap *cap;
				159	int have;
				160	int alloc = 0;
				161	LIST_HEAD(newcaps);
				162	int ret = 0;
				163
				164	dout("reserve caps ctx=%p need=%d\n", ctx, need);
				165
				166	/* first reserve any caps that are already allocated */
				167	spin_lock(&caps_list_lock);
				168	if (caps_avail_count >= need)
				169	have = need;
				170	else
				171	have = caps_avail_count;
				172	caps_avail_count -= have;
				173	caps_reserve_count += have;
				174	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				175	caps_avail_count);
				176	spin_unlock(&caps_list_lock);
				177
				178	for (i = have; i < need; i++) {
				179	cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				180	if (!cap) {
				181	ret = -ENOMEM;
				182	goto out_alloc_count;
				183	}
				184	list_add(&cap->caps_item, &newcaps);
				185	alloc++;
				186	}
				187	BUG_ON(have + alloc != need);
				188
				189	spin_lock(&caps_list_lock);
				190	caps_total_count += alloc;
				191	caps_reserve_count += alloc;
				192	list_splice(&newcaps, &caps_list);
				193
				194	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				195	caps_avail_count);
				196	spin_unlock(&caps_list_lock);
				197
				198	ctx->count = need;
				199	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
				200	ctx, caps_total_count, caps_use_count, caps_reserve_count,
				201	caps_avail_count);
				202	return 0;
				203
				204	out_alloc_count:
				205	/* we didn't manage to reserve as much as we needed */
				206	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
				207	ctx, need, have);
				208	return ret;
				209	}
				210
				211	int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
				212	{
				213	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
				214	if (ctx->count) {
				215	spin_lock(&caps_list_lock);
				216	BUG_ON(caps_reserve_count < ctx->count);
				217	caps_reserve_count -= ctx->count;
				218	caps_avail_count += ctx->count;
				219	ctx->count = 0;
				220	dout("unreserve caps %d = %d used + %d resv + %d avail\n",
				221	caps_total_count, caps_use_count, caps_reserve_count,
				222	caps_avail_count);
				223	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				224	caps_avail_count);
				225	spin_unlock(&caps_list_lock);
				226	}
				227	return 0;
				228	}
				229
				230	static struct ceph_cap get_cap(struct ceph_cap_reservation ctx)
				231	{
				232	struct ceph_cap *cap = NULL;
				233
				234	/* temporary, until we do something about cap import/export */
				235	if (!ctx)
				236	return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				237
				238	spin_lock(&caps_list_lock);
				239	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
				240	ctx, ctx->count, caps_total_count, caps_use_count,
				241	caps_reserve_count, caps_avail_count);
				242	BUG_ON(!ctx->count);
				243	BUG_ON(ctx->count > caps_reserve_count);
				244	BUG_ON(list_empty(&caps_list));
				245
				246	ctx->count--;
				247	caps_reserve_count--;
				248	caps_use_count++;
				249
				250	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
				251	list_del(&cap->caps_item);
				252
				253	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				254	caps_avail_count);
				255	spin_unlock(&caps_list_lock);
				256	return cap;
				257	}
				258
				259	static void put_cap(struct ceph_cap *cap,
				260	struct ceph_cap_reservation *ctx)
				261	{
				262	spin_lock(&caps_list_lock);
				263	dout("put_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
				264	ctx, ctx ? ctx->count : 0, caps_total_count, caps_use_count,
				265	caps_reserve_count, caps_avail_count);
				266	caps_use_count--;
				267	/*
				268	* Keep some preallocated caps around, at least enough to do a
				269	* readdir (which needs to preallocate lots of them), to avoid
				270	* lots of free/alloc churn.
				271	*/
				272	if (caps_avail_count >= caps_reserve_count +
Sage Weil	6b80518	2009-10-27 11:50:50 -0700	[diff] [blame]	273	ceph_client(cap->ci->vfs_inode.i_sb)->mount_args->max_readdir) {
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	274	caps_total_count--;
				275	kmem_cache_free(ceph_cap_cachep, cap);
				276	} else {
				277	if (ctx) {
				278	ctx->count++;
				279	caps_reserve_count++;
				280	} else {
				281	caps_avail_count++;
				282	}
				283	list_add(&cap->caps_item, &caps_list);
				284	}
				285
				286	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				287	caps_avail_count);
				288	spin_unlock(&caps_list_lock);
				289	}
				290
				291	void ceph_reservation_status(struct ceph_client *client,
				292	int total, int avail, int used, int reserved)
				293	{
				294	if (total)
				295	*total = caps_total_count;
				296	if (avail)
				297	*avail = caps_avail_count;
				298	if (used)
				299	*used = caps_use_count;
				300	if (reserved)
				301	*reserved = caps_reserve_count;
				302	}
				303
				304	/*
				305	* Find ceph_cap for given mds, if any.
				306	*
				307	* Called with i_lock held.
				308	*/
				309	static struct ceph_cap __get_cap_for_mds(struct ceph_inode_info ci, int mds)
				310	{
				311	struct ceph_cap *cap;
				312	struct rb_node *n = ci->i_caps.rb_node;
				313
				314	while (n) {
				315	cap = rb_entry(n, struct ceph_cap, ci_node);
				316	if (mds < cap->mds)
				317	n = n->rb_left;
				318	else if (mds > cap->mds)
				319	n = n->rb_right;
				320	else
				321	return cap;
				322	}
				323	return NULL;
				324	}
				325
				326	/*
				327	* Return id of any MDS with a cap, preferably FILE_WR\|WRBUFFER\|EXCL, else
				328	* -1.
				329	*/
				330	static int __ceph_get_cap_mds(struct ceph_inode_info ci, u32 mseq)
				331	{
				332	struct ceph_cap *cap;
				333	int mds = -1;
				334	struct rb_node *p;
				335
				336	/* prefer mds with WR\|WRBUFFER\|EXCL caps */
				337	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				338	cap = rb_entry(p, struct ceph_cap, ci_node);
				339	mds = cap->mds;
				340	if (mseq)
				341	*mseq = cap->mseq;
				342	if (cap->issued & (CEPH_CAP_FILE_WR \|
				343	CEPH_CAP_FILE_BUFFER \|
				344	CEPH_CAP_FILE_EXCL))
				345	break;
				346	}
				347	return mds;
				348	}
				349
				350	int ceph_get_cap_mds(struct inode *inode)
				351	{
				352	int mds;
				353	spin_lock(&inode->i_lock);
				354	mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
				355	spin_unlock(&inode->i_lock);
				356	return mds;
				357	}
				358
				359	/*
				360	* Called under i_lock.
				361	*/
				362	static void __insert_cap_node(struct ceph_inode_info *ci,
				363	struct ceph_cap *new)
				364	{
				365	struct rb_node **p = &ci->i_caps.rb_node;
				366	struct rb_node *parent = NULL;
				367	struct ceph_cap *cap = NULL;
				368
				369	while (*p) {
				370	parent = *p;
				371	cap = rb_entry(parent, struct ceph_cap, ci_node);
				372	if (new->mds < cap->mds)
				373	p = &(*p)->rb_left;
				374	else if (new->mds > cap->mds)
				375	p = &(*p)->rb_right;
				376	else
				377	BUG();
				378	}
				379
				380	rb_link_node(&new->ci_node, parent, p);
				381	rb_insert_color(&new->ci_node, &ci->i_caps);
				382	}
				383
				384	/*
				385	* (re)set cap hold timeouts, which control the delayed release
				386	* of unused caps back to the MDS. Should be called on cap use.
				387	*/
				388	static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
				389	struct ceph_inode_info *ci)
				390	{
Sage Weil	6b80518	2009-10-27 11:50:50 -0700	[diff] [blame]	391	struct ceph_mount_args *ma = mdsc->client->mount_args;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	392
				393	ci->i_hold_caps_min = round_jiffies(jiffies +
				394	ma->caps_wanted_delay_min * HZ);
				395	ci->i_hold_caps_max = round_jiffies(jiffies +
				396	ma->caps_wanted_delay_max * HZ);
				397	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
				398	ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
				399	}
				400
				401	/*
				402	* (Re)queue cap at the end of the delayed cap release list.
				403	*
				404	* If I_FLUSH is set, leave the inode at the front of the list.
				405	*
				406	* Caller holds i_lock
				407	* -> we take mdsc->cap_delay_lock
				408	*/
				409	static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				410	struct ceph_inode_info *ci)
				411	{
				412	__cap_set_timeouts(mdsc, ci);
				413	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
				414	ci->i_ceph_flags, ci->i_hold_caps_max);
				415	if (!mdsc->stopping) {
				416	spin_lock(&mdsc->cap_delay_lock);
				417	if (!list_empty(&ci->i_cap_delay_list)) {
				418	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				419	goto no_change;
				420	list_del_init(&ci->i_cap_delay_list);
				421	}
				422	list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				423	no_change:
				424	spin_unlock(&mdsc->cap_delay_lock);
				425	}
				426	}
				427
				428	/*
				429	* Queue an inode for immediate writeback. Mark inode with I_FLUSH,
				430	* indicating we should send a cap message to flush dirty metadata
				431	* asap, and move to the front of the delayed cap list.
				432	*/
				433	static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				434	struct ceph_inode_info *ci)
				435	{
				436	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
				437	spin_lock(&mdsc->cap_delay_lock);
				438	ci->i_ceph_flags \|= CEPH_I_FLUSH;
				439	if (!list_empty(&ci->i_cap_delay_list))
				440	list_del_init(&ci->i_cap_delay_list);
				441	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				442	spin_unlock(&mdsc->cap_delay_lock);
				443	}
				444
				445	/*
				446	* Cancel delayed work on cap.
				447	*
				448	* Caller must hold i_lock.
				449	*/
				450	static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
				451	struct ceph_inode_info *ci)
				452	{
				453	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
				454	if (list_empty(&ci->i_cap_delay_list))
				455	return;
				456	spin_lock(&mdsc->cap_delay_lock);
				457	list_del_init(&ci->i_cap_delay_list);
				458	spin_unlock(&mdsc->cap_delay_lock);
				459	}
				460
				461	/*
				462	* Common issue checks for add_cap, handle_cap_grant.
				463	*/
				464	static void __check_cap_issue(struct ceph_inode_info ci, struct ceph_cap cap,
				465	unsigned issued)
				466	{
				467	unsigned had = __ceph_caps_issued(ci, NULL);
				468
				469	/*
				470	* Each time we receive FILE_CACHE anew, we increment
				471	* i_rdcache_gen.
				472	*/
				473	if ((issued & CEPH_CAP_FILE_CACHE) &&
				474	(had & CEPH_CAP_FILE_CACHE) == 0)
				475	ci->i_rdcache_gen++;
				476
				477	/*
				478	* if we are newly issued FILE_SHARED, clear I_COMPLETE; we
				479	* don't know what happened to this directory while we didn't
				480	* have the cap.
				481	*/
				482	if ((issued & CEPH_CAP_FILE_SHARED) &&
				483	(had & CEPH_CAP_FILE_SHARED) == 0) {
				484	ci->i_shared_gen++;
				485	if (S_ISDIR(ci->vfs_inode.i_mode)) {
				486	dout(" marking %p NOT complete\n", &ci->vfs_inode);
				487	ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
				488	}
				489	}
				490	}
				491
				492	/*
				493	* Add a capability under the given MDS session.
				494	*
				495	* Caller should hold session snap_rwsem (read) and s_mutex.
				496	*
				497	* @fmode is the open file mode, if we are opening a file, otherwise
				498	* it is < 0. (This is so we can atomically add the cap and add an
				499	* open file reference to it.)
				500	*/
				501	int ceph_add_cap(struct inode *inode,
				502	struct ceph_mds_session *session, u64 cap_id,
				503	int fmode, unsigned issued, unsigned wanted,
				504	unsigned seq, unsigned mseq, u64 realmino, int flags,
				505	struct ceph_cap_reservation *caps_reservation)
				506	{
				507	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
				508	struct ceph_inode_info *ci = ceph_inode(inode);
				509	struct ceph_cap *new_cap = NULL;
				510	struct ceph_cap *cap;
				511	int mds = session->s_mds;
				512	int actual_wanted;
				513
				514	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
				515	session->s_mds, cap_id, ceph_cap_string(issued), seq);
				516
				517	/*
				518	* If we are opening the file, include file mode wanted bits
				519	* in wanted.
				520	*/
				521	if (fmode >= 0)
				522	wanted \|= ceph_caps_for_mode(fmode);
				523
				524	retry:
				525	spin_lock(&inode->i_lock);
				526	cap = __get_cap_for_mds(ci, mds);
				527	if (!cap) {
				528	if (new_cap) {
				529	cap = new_cap;
				530	new_cap = NULL;
				531	} else {
				532	spin_unlock(&inode->i_lock);
				533	new_cap = get_cap(caps_reservation);
				534	if (new_cap == NULL)
				535	return -ENOMEM;
				536	goto retry;
				537	}
				538
				539	cap->issued = 0;
				540	cap->implemented = 0;
				541	cap->mds = mds;
				542	cap->mds_wanted = 0;
				543
				544	cap->ci = ci;
				545	__insert_cap_node(ci, cap);
				546
				547	/* clear out old exporting info? (i.e. on cap import) */
				548	if (ci->i_cap_exporting_mds == mds) {
				549	ci->i_cap_exporting_issued = 0;
				550	ci->i_cap_exporting_mseq = 0;
				551	ci->i_cap_exporting_mds = -1;
				552	}
				553
				554	/* add to session cap list */
				555	cap->session = session;
				556	spin_lock(&session->s_cap_lock);
				557	list_add_tail(&cap->session_caps, &session->s_caps);
				558	session->s_nr_caps++;
				559	spin_unlock(&session->s_cap_lock);
				560	}
				561
				562	if (!ci->i_snap_realm) {
				563	/*
				564	* add this inode to the appropriate snap realm
				565	*/
				566	struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
				567	realmino);
				568	if (realm) {
				569	ceph_get_snap_realm(mdsc, realm);
				570	spin_lock(&realm->inodes_with_caps_lock);
				571	ci->i_snap_realm = realm;
				572	list_add(&ci->i_snap_realm_item,
				573	&realm->inodes_with_caps);
				574	spin_unlock(&realm->inodes_with_caps_lock);
				575	} else {
				576	pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
				577	realmino);
				578	}
				579	}
				580
				581	__check_cap_issue(ci, cap, issued);
				582
				583	/*
				584	* If we are issued caps we don't want, or the mds' wanted
				585	* value appears to be off, queue a check so we'll release
				586	* later and/or update the mds wanted value.
				587	*/
				588	actual_wanted = __ceph_caps_wanted(ci);
				589	if ((wanted & ~actual_wanted) \|\|
				590	(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
				591	dout(" issued %s, mds wanted %s, actual %s, queueing\n",
				592	ceph_cap_string(issued), ceph_cap_string(wanted),
				593	ceph_cap_string(actual_wanted));
				594	__cap_delay_requeue(mdsc, ci);
				595	}
				596
				597	if (flags & CEPH_CAP_FLAG_AUTH)
				598	ci->i_auth_cap = cap;
				599	else if (ci->i_auth_cap == cap)
				600	ci->i_auth_cap = NULL;
				601
				602	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
				603	inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
				604	ceph_cap_string(issued\|cap->issued), seq, mds);
				605	cap->cap_id = cap_id;
				606	cap->issued = issued;
				607	cap->implemented \|= issued;
				608	cap->mds_wanted \|= wanted;
				609	cap->seq = seq;
				610	cap->issue_seq = seq;
				611	cap->mseq = mseq;
Sage Weil	685f9a5d	2009-11-09 12:05:48 -0800	[diff] [blame^]	612	cap->cap_gen = session->s_cap_gen;
				613	cap->recon_gen = session->s_recon_gen;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	614
				615	if (fmode >= 0)
				616	__ceph_get_fmode(ci, fmode);
				617	spin_unlock(&inode->i_lock);
				618	wake_up(&ci->i_cap_wq);
				619	return 0;
				620	}
				621
				622	/*
				623	* Return true if cap has not timed out and belongs to the current
				624	* generation of the MDS session (i.e. has not gone 'stale' due to
				625	* us losing touch with the mds).
				626	*/
				627	static int __cap_is_valid(struct ceph_cap *cap)
				628	{
				629	unsigned long ttl;
Sage Weil	685f9a5d	2009-11-09 12:05:48 -0800	[diff] [blame^]	630	u32 gen, recon_gen;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	631
				632	spin_lock(&cap->session->s_cap_lock);
				633	gen = cap->session->s_cap_gen;
Sage Weil	685f9a5d	2009-11-09 12:05:48 -0800	[diff] [blame^]	634	recon_gen = cap->session->s_recon_gen;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	635	ttl = cap->session->s_cap_ttl;
				636	spin_unlock(&cap->session->s_cap_lock);
				637
Sage Weil	685f9a5d	2009-11-09 12:05:48 -0800	[diff] [blame^]	638	if (cap->recon_gen != recon_gen) {
				639	dout("__cap_is_valid %p cap %p issued %s "
				640	"but DEAD (recon_gen %u vs %u)\n", &cap->ci->vfs_inode,
				641	cap, ceph_cap_string(cap->issued), cap->recon_gen,
				642	recon_gen);
				643	return 0;
				644	}
				645	if (cap->cap_gen < gen \|\| time_after_eq(jiffies, ttl)) {
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	646	dout("__cap_is_valid %p cap %p issued %s "
				647	"but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
Sage Weil	685f9a5d	2009-11-09 12:05:48 -0800	[diff] [blame^]	648	cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	649	return 0;
				650	}
				651
				652	return 1;
				653	}
				654
				655	/*
				656	* Return set of valid cap bits issued to us. Note that caps time
				657	* out, and may be invalidated in bulk if the client session times out
				658	* and session->s_cap_gen is bumped.
				659	*/
				660	int __ceph_caps_issued(struct ceph_inode_info ci, int implemented)
				661	{
				662	int have = ci->i_snap_caps;
				663	struct ceph_cap *cap;
				664	struct rb_node *p;
				665
				666	if (implemented)
				667	*implemented = 0;
				668	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				669	cap = rb_entry(p, struct ceph_cap, ci_node);
				670	if (!__cap_is_valid(cap))
				671	continue;
				672	dout("__ceph_caps_issued %p cap %p issued %s\n",
				673	&ci->vfs_inode, cap, ceph_cap_string(cap->issued));
				674	have \|= cap->issued;
				675	if (implemented)
				676	*implemented \|= cap->implemented;
				677	}
				678	return have;
				679	}
				680
				681	/*
				682	* Get cap bits issued by caps other than @ocap
				683	*/
				684	int __ceph_caps_issued_other(struct ceph_inode_info ci, struct ceph_cap ocap)
				685	{
				686	int have = ci->i_snap_caps;
				687	struct ceph_cap *cap;
				688	struct rb_node *p;
				689
				690	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				691	cap = rb_entry(p, struct ceph_cap, ci_node);
				692	if (cap == ocap)
				693	continue;
				694	if (!__cap_is_valid(cap))
				695	continue;
				696	have \|= cap->issued;
				697	}
				698	return have;
				699	}
				700
				701	/*
				702	* Move a cap to the end of the LRU (oldest caps at list head, newest
				703	* at list tail).
				704	*/
				705	static void __touch_cap(struct ceph_cap *cap)
				706	{
				707	struct ceph_mds_session *s = cap->session;
				708
				709	dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
				710	s->s_mds);
				711	spin_lock(&s->s_cap_lock);
				712	list_move_tail(&cap->session_caps, &s->s_caps);
				713	spin_unlock(&s->s_cap_lock);
				714	}
				715
				716	/*
				717	* Check if we hold the given mask. If so, move the cap(s) to the
				718	* front of their respective LRUs. (This is the preferred way for
				719	* callers to check for caps they want.)
				720	*/
				721	int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
				722	{
				723	struct ceph_cap *cap;
				724	struct rb_node *p;
				725	int have = ci->i_snap_caps;
				726
				727	if ((have & mask) == mask) {
				728	dout("__ceph_caps_issued_mask %p snap issued %s"
				729	" (mask %s)\n", &ci->vfs_inode,
				730	ceph_cap_string(have),
				731	ceph_cap_string(mask));
				732	return 1;
				733	}
				734
				735	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				736	cap = rb_entry(p, struct ceph_cap, ci_node);
				737	if (!__cap_is_valid(cap))
				738	continue;
				739	if ((cap->issued & mask) == mask) {
				740	dout("__ceph_caps_issued_mask %p cap %p issued %s"
				741	" (mask %s)\n", &ci->vfs_inode, cap,
				742	ceph_cap_string(cap->issued),
				743	ceph_cap_string(mask));
				744	if (touch)
				745	__touch_cap(cap);
				746	return 1;
				747	}
				748
				749	/* does a combination of caps satisfy mask? */
				750	have \|= cap->issued;
				751	if ((have & mask) == mask) {
				752	dout("__ceph_caps_issued_mask %p combo issued %s"
				753	" (mask %s)\n", &ci->vfs_inode,
				754	ceph_cap_string(cap->issued),
				755	ceph_cap_string(mask));
				756	if (touch) {
				757	struct rb_node *q;
				758
				759	/* touch this + preceeding caps */
				760	__touch_cap(cap);
				761	for (q = rb_first(&ci->i_caps); q != p;
				762	q = rb_next(q)) {
				763	cap = rb_entry(q, struct ceph_cap,
				764	ci_node);
				765	if (!__cap_is_valid(cap))
				766	continue;
				767	__touch_cap(cap);
				768	}
				769	}
				770	return 1;
				771	}
				772	}
				773
				774	return 0;
				775	}
				776
				777	/*
				778	* Return true if mask caps are currently being revoked by an MDS.
				779	*/
				780	int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
				781	{
				782	struct inode *inode = &ci->vfs_inode;
				783	struct ceph_cap *cap;
				784	struct rb_node *p;
				785	int ret = 0;
				786
				787	spin_lock(&inode->i_lock);
				788	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				789	cap = rb_entry(p, struct ceph_cap, ci_node);
				790	if (__cap_is_valid(cap) &&
				791	(cap->implemented & ~cap->issued & mask)) {
				792	ret = 1;
				793	break;
				794	}
				795	}
				796	spin_unlock(&inode->i_lock);
				797	dout("ceph_caps_revoking %p %s = %d\n", inode,
				798	ceph_cap_string(mask), ret);
				799	return ret;
				800	}
				801
				802	int __ceph_caps_used(struct ceph_inode_info *ci)
				803	{
				804	int used = 0;
				805	if (ci->i_pin_ref)
				806	used \|= CEPH_CAP_PIN;
				807	if (ci->i_rd_ref)
				808	used \|= CEPH_CAP_FILE_RD;
				809	if (ci->i_rdcache_ref \|\| ci->i_rdcache_gen)
				810	used \|= CEPH_CAP_FILE_CACHE;
				811	if (ci->i_wr_ref)
				812	used \|= CEPH_CAP_FILE_WR;
				813	if (ci->i_wrbuffer_ref)
				814	used \|= CEPH_CAP_FILE_BUFFER;
				815	return used;
				816	}
				817
				818	/*
				819	* wanted, by virtue of open file modes
				820	*/
				821	int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
				822	{
				823	int want = 0;
				824	int mode;
				825	for (mode = 0; mode < 4; mode++)
				826	if (ci->i_nr_by_mode[mode])
				827	want \|= ceph_caps_for_mode(mode);
				828	return want;
				829	}
				830
				831	/*
				832	* Return caps we have registered with the MDS(s) as 'wanted'.
				833	*/
				834	int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
				835	{
				836	struct ceph_cap *cap;
				837	struct rb_node *p;
				838	int mds_wanted = 0;
				839
				840	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				841	cap = rb_entry(p, struct ceph_cap, ci_node);
				842	if (!__cap_is_valid(cap))
				843	continue;
				844	mds_wanted \|= cap->mds_wanted;
				845	}
				846	return mds_wanted;
				847	}
				848
				849	/*
				850	* called under i_lock
				851	*/
				852	static int __ceph_is_any_caps(struct ceph_inode_info *ci)
				853	{
				854	return !RB_EMPTY_ROOT(&ci->i_caps) \|\| ci->i_cap_exporting_mds >= 0;
				855	}
				856
				857	/*
				858	* caller should hold i_lock, and session s_mutex.
				859	* returns true if this is the last cap. if so, caller should iput.
				860	*/
				861	void __ceph_remove_cap(struct ceph_cap *cap,
				862	struct ceph_cap_reservation *ctx)
				863	{
				864	struct ceph_mds_session *session = cap->session;
				865	struct ceph_inode_info *ci = cap->ci;
				866	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
				867
				868	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
				869
				870	/* remove from session list */
				871	spin_lock(&session->s_cap_lock);
				872	list_del_init(&cap->session_caps);
				873	session->s_nr_caps--;
				874	spin_unlock(&session->s_cap_lock);
				875
				876	/* remove from inode list */
				877	rb_erase(&cap->ci_node, &ci->i_caps);
				878	cap->session = NULL;
				879	if (ci->i_auth_cap == cap)
				880	ci->i_auth_cap = NULL;
				881
				882	put_cap(cap, ctx);
				883
				884	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
				885	struct ceph_snap_realm *realm = ci->i_snap_realm;
				886	spin_lock(&realm->inodes_with_caps_lock);
				887	list_del_init(&ci->i_snap_realm_item);
				888	ci->i_snap_realm_counter++;
				889	ci->i_snap_realm = NULL;
				890	spin_unlock(&realm->inodes_with_caps_lock);
				891	ceph_put_snap_realm(mdsc, realm);
				892	}
				893	if (!__ceph_is_any_real_caps(ci))
				894	__cap_delay_cancel(mdsc, ci);
				895	}
				896
				897	/*
				898	* Build and send a cap message to the given MDS.
				899	*
				900	* Caller should be holding s_mutex.
				901	*/
				902	static int send_cap_msg(struct ceph_mds_session *session,
				903	u64 ino, u64 cid, int op,
				904	int caps, int wanted, int dirty,
				905	u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
				906	u64 size, u64 max_size,
				907	struct timespec mtime, struct timespec atime,
				908	u64 time_warp_seq,
				909	uid_t uid, gid_t gid, mode_t mode,
				910	u64 xattr_version,
				911	struct ceph_buffer *xattrs_buf,
				912	u64 follows)
				913	{
				914	struct ceph_mds_caps *fc;
				915	struct ceph_msg *msg;
				916
				917	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
				918	" seq %u/%u mseq %u follows %lld size %llu/%llu"
				919	" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
				920	cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
				921	ceph_cap_string(dirty),
				922	seq, issue_seq, mseq, follows, size, max_size,
				923	xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
				924
				925	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
				926	if (IS_ERR(msg))
				927	return PTR_ERR(msg);
				928
				929	fc = msg->front.iov_base;
				930
				931	memset(fc, 0, sizeof(*fc));
				932
				933	fc->cap_id = cpu_to_le64(cid);
				934	fc->op = cpu_to_le32(op);
				935	fc->seq = cpu_to_le32(seq);
				936	fc->client_tid = cpu_to_le64(flush_tid);
				937	fc->issue_seq = cpu_to_le32(issue_seq);
				938	fc->migrate_seq = cpu_to_le32(mseq);
				939	fc->caps = cpu_to_le32(caps);
				940	fc->wanted = cpu_to_le32(wanted);
				941	fc->dirty = cpu_to_le32(dirty);
				942	fc->ino = cpu_to_le64(ino);
				943	fc->snap_follows = cpu_to_le64(follows);
				944
				945	fc->size = cpu_to_le64(size);
				946	fc->max_size = cpu_to_le64(max_size);
				947	if (mtime)
				948	ceph_encode_timespec(&fc->mtime, mtime);
				949	if (atime)
				950	ceph_encode_timespec(&fc->atime, atime);
				951	fc->time_warp_seq = cpu_to_le32(time_warp_seq);
				952
				953	fc->uid = cpu_to_le32(uid);
				954	fc->gid = cpu_to_le32(gid);
				955	fc->mode = cpu_to_le32(mode);
				956
				957	fc->xattr_version = cpu_to_le64(xattr_version);
				958	if (xattrs_buf) {
				959	msg->middle = ceph_buffer_get(xattrs_buf);
				960	fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
				961	msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
				962	}
				963
				964	ceph_con_send(&session->s_con, msg);
				965	return 0;
				966	}
				967
				968	/*
				969	* Queue cap releases when an inode is dropped from our
				970	* cache.
				971	*/
				972	void ceph_queue_caps_release(struct inode *inode)
				973	{
				974	struct ceph_inode_info *ci = ceph_inode(inode);
				975	struct rb_node *p;
				976
				977	spin_lock(&inode->i_lock);
				978	p = rb_first(&ci->i_caps);
				979	while (p) {
				980	struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
				981	struct ceph_mds_session *session = cap->session;
				982	struct ceph_msg *msg;
				983	struct ceph_mds_cap_release *head;
				984	struct ceph_mds_cap_item *item;
				985
				986	spin_lock(&session->s_cap_lock);
				987	BUG_ON(!session->s_num_cap_releases);
				988	msg = list_first_entry(&session->s_cap_releases,
				989	struct ceph_msg, list_head);
				990
				991	dout(" adding %p release to mds%d msg %p (%d left)\n",
				992	inode, session->s_mds, msg, session->s_num_cap_releases);
				993
				994	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
				995	head = msg->front.iov_base;
				996	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
				997	item = msg->front.iov_base + msg->front.iov_len;
				998	item->ino = cpu_to_le64(ceph_ino(inode));
				999	item->cap_id = cpu_to_le64(cap->cap_id);
				1000	item->migrate_seq = cpu_to_le32(cap->mseq);
				1001	item->seq = cpu_to_le32(cap->issue_seq);
				1002
				1003	session->s_num_cap_releases--;
				1004
				1005	msg->front.iov_len += sizeof(*item);
				1006	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
				1007	dout(" release msg %p full\n", msg);
				1008	list_move_tail(&msg->list_head,
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	1009	&session->s_cap_releases_done);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1010	} else {
				1011	dout(" release msg %p at %d/%d (%d)\n", msg,
				1012	(int)le32_to_cpu(head->num),
				1013	(int)CEPH_CAPS_PER_RELEASE,
				1014	(int)msg->front.iov_len);
				1015	}
				1016	spin_unlock(&session->s_cap_lock);
				1017	p = rb_next(p);
				1018	__ceph_remove_cap(cap, NULL);
				1019
				1020	}
				1021	spin_unlock(&inode->i_lock);
				1022	}
				1023
				1024	/*
				1025	* Send a cap msg on the given inode. Update our caps state, then
				1026	* drop i_lock and send the message.
				1027	*
				1028	* Make note of max_size reported/requested from mds, revoked caps
				1029	* that have now been implemented.
				1030	*
				1031	* Make half-hearted attempt ot to invalidate page cache if we are
				1032	* dropping RDCACHE. Note that this will leave behind locked pages
				1033	* that we'll then need to deal with elsewhere.
				1034	*
				1035	* Return non-zero if delayed release, or we experienced an error
				1036	* such that the caller should requeue + retry later.
				1037	*
				1038	* called with i_lock, then drops it.
				1039	* caller should hold snap_rwsem (read), s_mutex.
				1040	*/
				1041	static int __send_cap(struct ceph_mds_client mdsc, struct ceph_cap cap,
				1042	int op, int used, int want, int retain, int flushing,
				1043	unsigned *pflush_tid)
				1044	__releases(cap->ci->vfs_inode->i_lock)
				1045	{
				1046	struct ceph_inode_info *ci = cap->ci;
				1047	struct inode *inode = &ci->vfs_inode;
				1048	u64 cap_id = cap->cap_id;
				1049	int held = cap->issued \| cap->implemented;
				1050	int revoking = cap->implemented & ~cap->issued;
				1051	int dropping = cap->issued & ~retain;
				1052	int keep;
				1053	u64 seq, issue_seq, mseq, time_warp_seq, follows;
				1054	u64 size, max_size;
				1055	struct timespec mtime, atime;
				1056	int wake = 0;
				1057	mode_t mode;
				1058	uid_t uid;
				1059	gid_t gid;
				1060	struct ceph_mds_session *session;
				1061	u64 xattr_version = 0;
				1062	int delayed = 0;
				1063	u64 flush_tid = 0;
				1064	int i;
				1065	int ret;
				1066
				1067	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
				1068	inode, cap, cap->session,
				1069	ceph_cap_string(held), ceph_cap_string(held & retain),
				1070	ceph_cap_string(revoking));
				1071	BUG_ON((retain & CEPH_CAP_PIN) == 0);
				1072
				1073	session = cap->session;
				1074
				1075	/* don't release wanted unless we've waited a bit. */
				1076	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1077	time_before(jiffies, ci->i_hold_caps_min)) {
				1078	dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
				1079	ceph_cap_string(cap->issued),
				1080	ceph_cap_string(cap->issued & retain),
				1081	ceph_cap_string(cap->mds_wanted),
				1082	ceph_cap_string(want));
				1083	want \|= cap->mds_wanted;
				1084	retain \|= cap->issued;
				1085	delayed = 1;
				1086	}
				1087	ci->i_ceph_flags &= ~(CEPH_I_NODELAY \| CEPH_I_FLUSH);
				1088
				1089	cap->issued &= retain; /* drop bits we don't want */
				1090	if (cap->implemented & ~cap->issued) {
				1091	/*
				1092	* Wake up any waiters on wanted -> needed transition.
				1093	* This is due to the weird transition from buffered
				1094	* to sync IO... we need to flush dirty pages _before_
				1095	* allowing sync writes to avoid reordering.
				1096	*/
				1097	wake = 1;
				1098	}
				1099	cap->implemented &= cap->issued \| used;
				1100	cap->mds_wanted = want;
				1101
				1102	if (flushing) {
				1103	/*
				1104	* assign a tid for flush operations so we can avoid
				1105	* flush1 -> dirty1 -> flush2 -> flushack1 -> mark
				1106	* clean type races. track latest tid for every bit
				1107	* so we can handle flush AxFw, flush Fw, and have the
				1108	* first ack clean Ax.
				1109	*/
				1110	flush_tid = ++ci->i_cap_flush_last_tid;
				1111	if (pflush_tid)
				1112	*pflush_tid = flush_tid;
				1113	dout(" cap_flush_tid %d\n", (int)flush_tid);
				1114	for (i = 0; i < CEPH_CAP_BITS; i++)
				1115	if (flushing & (1 << i))
				1116	ci->i_cap_flush_tid[i] = flush_tid;
				1117	}
				1118
				1119	keep = cap->implemented;
				1120	seq = cap->seq;
				1121	issue_seq = cap->issue_seq;
				1122	mseq = cap->mseq;
				1123	size = inode->i_size;
				1124	ci->i_reported_size = size;
				1125	max_size = ci->i_wanted_max_size;
				1126	ci->i_requested_max_size = max_size;
				1127	mtime = inode->i_mtime;
				1128	atime = inode->i_atime;
				1129	time_warp_seq = ci->i_time_warp_seq;
				1130	follows = ci->i_snap_realm->cached_context->seq;
				1131	uid = inode->i_uid;
				1132	gid = inode->i_gid;
				1133	mode = inode->i_mode;
				1134
				1135	if (dropping & CEPH_CAP_XATTR_EXCL) {
				1136	__ceph_build_xattrs_blob(ci);
				1137	xattr_version = ci->i_xattrs.version + 1;
				1138	}
				1139
				1140	spin_unlock(&inode->i_lock);
				1141
				1142	if (dropping & CEPH_CAP_FILE_CACHE) {
				1143	/* invalidate what we can */
				1144	dout("invalidating pages on %p\n", inode);
				1145	invalidate_mapping_pages(&inode->i_data, 0, -1);
				1146	}
				1147
				1148	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
				1149	op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
				1150	size, max_size, &mtime, &atime, time_warp_seq,
				1151	uid, gid, mode,
				1152	xattr_version,
				1153	(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
				1154	follows);
				1155	if (ret < 0) {
				1156	dout("error sending cap msg, must requeue %p\n", inode);
				1157	delayed = 1;
				1158	}
				1159
				1160	if (wake)
				1161	wake_up(&ci->i_cap_wq);
				1162
				1163	return delayed;
				1164	}
				1165
				1166	/*
				1167	* When a snapshot is taken, clients accumulate dirty metadata on
				1168	* inodes with capabilities in ceph_cap_snaps to describe the file
				1169	* state at the time the snapshot was taken. This must be flushed
				1170	* asynchronously back to the MDS once sync writes complete and dirty
				1171	* data is written out.
				1172	*
				1173	* Called under i_lock. Takes s_mutex as needed.
				1174	*/
				1175	void __ceph_flush_snaps(struct ceph_inode_info *ci,
				1176	struct ceph_mds_session **psession)
				1177	{
				1178	struct inode *inode = &ci->vfs_inode;
				1179	int mds;
				1180	struct ceph_cap_snap *capsnap;
				1181	u32 mseq;
				1182	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
				1183	struct ceph_mds_session session = NULL; / if session != NULL, we hold
				1184	session->s_mutex */
				1185	u64 next_follows = 0; /* keep track of how far we've gotten through the
				1186	i_cap_snaps list, and skip these entries next time
				1187	around to avoid an infinite loop */
				1188
				1189	if (psession)
				1190	session = *psession;
				1191
				1192	dout("__flush_snaps %p\n", inode);
				1193	retry:
				1194	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				1195	/* avoid an infiniute loop after retry */
				1196	if (capsnap->follows < next_follows)
				1197	continue;
				1198	/*
				1199	* we need to wait for sync writes to complete and for dirty
				1200	* pages to be written out.
				1201	*/
				1202	if (capsnap->dirty_pages \|\| capsnap->writing)
				1203	continue;
				1204
				1205	/* pick mds, take s_mutex */
				1206	mds = __ceph_get_cap_mds(ci, &mseq);
				1207	if (session && session->s_mds != mds) {
				1208	dout("oops, wrong session %p mutex\n", session);
				1209	mutex_unlock(&session->s_mutex);
				1210	ceph_put_mds_session(session);
				1211	session = NULL;
				1212	}
				1213	if (!session) {
				1214	spin_unlock(&inode->i_lock);
				1215	mutex_lock(&mdsc->mutex);
				1216	session = __ceph_lookup_mds_session(mdsc, mds);
				1217	mutex_unlock(&mdsc->mutex);
				1218	if (session) {
				1219	dout("inverting session/ino locks on %p\n",
				1220	session);
				1221	mutex_lock(&session->s_mutex);
				1222	}
				1223	/*
				1224	* if session == NULL, we raced against a cap
				1225	* deletion. retry, and we'll get a better
				1226	* @mds value next time.
				1227	*/
				1228	spin_lock(&inode->i_lock);
				1229	goto retry;
				1230	}
				1231
				1232	capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
				1233	atomic_inc(&capsnap->nref);
				1234	if (!list_empty(&capsnap->flushing_item))
				1235	list_del_init(&capsnap->flushing_item);
				1236	list_add_tail(&capsnap->flushing_item,
				1237	&session->s_cap_snaps_flushing);
				1238	spin_unlock(&inode->i_lock);
				1239
				1240	dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
				1241	inode, capsnap, next_follows, capsnap->size);
				1242	send_cap_msg(session, ceph_vino(inode).ino, 0,
				1243	CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
				1244	capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
				1245	capsnap->size, 0,
				1246	&capsnap->mtime, &capsnap->atime,
				1247	capsnap->time_warp_seq,
				1248	capsnap->uid, capsnap->gid, capsnap->mode,
				1249	0, NULL,
				1250	capsnap->follows);
				1251
				1252	next_follows = capsnap->follows + 1;
				1253	ceph_put_cap_snap(capsnap);
				1254
				1255	spin_lock(&inode->i_lock);
				1256	goto retry;
				1257	}
				1258
				1259	/* we flushed them all; remove this inode from the queue */
				1260	spin_lock(&mdsc->snap_flush_lock);
				1261	list_del_init(&ci->i_snap_flush_item);
				1262	spin_unlock(&mdsc->snap_flush_lock);
				1263
				1264	if (psession)
				1265	*psession = session;
				1266	else if (session) {
				1267	mutex_unlock(&session->s_mutex);
				1268	ceph_put_mds_session(session);
				1269	}
				1270	}
				1271
				1272	static void ceph_flush_snaps(struct ceph_inode_info *ci)
				1273	{
				1274	struct inode *inode = &ci->vfs_inode;
				1275
				1276	spin_lock(&inode->i_lock);
				1277	__ceph_flush_snaps(ci, NULL);
				1278	spin_unlock(&inode->i_lock);
				1279	}
				1280
				1281	/*
Sage Weil	76e3b39	2009-10-15 18:13:53 -0700	[diff] [blame]	1282	* Mark caps dirty. If inode is newly dirty, add to the global dirty
				1283	* list.
				1284	*/
				1285	void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
				1286	{
				1287	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
				1288	struct inode *inode = &ci->vfs_inode;
				1289	int was = ci->i_dirty_caps;
				1290	int dirty = 0;
				1291
				1292	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
				1293	ceph_cap_string(mask), ceph_cap_string(was),
				1294	ceph_cap_string(was \| mask));
				1295	ci->i_dirty_caps \|= mask;
				1296	if (was == 0) {
				1297	dout(" inode %p now dirty\n", &ci->vfs_inode);
				1298	BUG_ON(!list_empty(&ci->i_dirty_item));
				1299	spin_lock(&mdsc->cap_dirty_lock);
				1300	list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
				1301	spin_unlock(&mdsc->cap_dirty_lock);
				1302	if (ci->i_flushing_caps == 0) {
				1303	igrab(inode);
				1304	dirty \|= I_DIRTY_SYNC;
				1305	}
				1306	}
				1307	BUG_ON(list_empty(&ci->i_dirty_item));
				1308	if (((was \| ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
				1309	(mask & CEPH_CAP_FILE_BUFFER))
				1310	dirty \|= I_DIRTY_DATASYNC;
				1311	if (dirty)
				1312	__mark_inode_dirty(inode, dirty);
				1313	__cap_delay_requeue(mdsc, ci);
				1314	}
				1315
				1316	/*
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1317	* Add dirty inode to the flushing list. Assigned a seq number so we
				1318	* can wait for caps to flush without starving.
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1319	*
				1320	* Called under i_lock.
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1321	*/
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1322	static int __mark_caps_flushing(struct inode *inode,
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1323	struct ceph_mds_session *session)
				1324	{
				1325	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1326	struct ceph_inode_info *ci = ceph_inode(inode);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1327	int flushing;
				1328
				1329	BUG_ON(ci->i_dirty_caps == 0);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1330	BUG_ON(list_empty(&ci->i_dirty_item));
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1331
				1332	flushing = ci->i_dirty_caps;
				1333	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
				1334	ceph_cap_string(flushing),
				1335	ceph_cap_string(ci->i_flushing_caps),
				1336	ceph_cap_string(ci->i_flushing_caps \| flushing));
				1337	ci->i_flushing_caps \|= flushing;
				1338	ci->i_dirty_caps = 0;
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	1339	dout(" inode %p now !dirty\n", inode);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1340
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1341	spin_lock(&mdsc->cap_dirty_lock);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	1342	list_del_init(&ci->i_dirty_item);
				1343
				1344	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1345	if (list_empty(&ci->i_flushing_item)) {
				1346	list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
				1347	mdsc->num_cap_flushing++;
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	1348	dout(" inode %p now flushing seq %lld\n", inode,
				1349	ci->i_cap_flush_seq);
				1350	} else {
				1351	list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
				1352	dout(" inode %p now flushing (more) seq %lld\n", inode,
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1353	ci->i_cap_flush_seq);
				1354	}
				1355	spin_unlock(&mdsc->cap_dirty_lock);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1356
				1357	return flushing;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1358	}
				1359
				1360	/*
				1361	* Swiss army knife function to examine currently used and wanted
				1362	* versus held caps. Release, flush, ack revoked caps to mds as
				1363	* appropriate.
				1364	*
				1365	* CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
				1366	* cap release further.
				1367	* CHECK_CAPS_AUTHONLY - we should only check the auth cap
				1368	* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
				1369	* further delay.
				1370	*/
				1371	void ceph_check_caps(struct ceph_inode_info *ci, int flags,
				1372	struct ceph_mds_session *session)
				1373	{
				1374	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
				1375	struct ceph_mds_client *mdsc = &client->mdsc;
				1376	struct inode *inode = &ci->vfs_inode;
				1377	struct ceph_cap *cap;
				1378	int file_wanted, used;
				1379	int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
				1380	int drop_session_lock = session ? 0 : 1;
				1381	int want, retain, revoking, flushing = 0;
				1382	int mds = -1; /* keep track of how far we've gone through i_caps list
				1383	to avoid an infinite loop on retry */
				1384	struct rb_node *p;
				1385	int tried_invalidate = 0;
				1386	int delayed = 0, sent = 0, force_requeue = 0, num;
				1387	int is_delayed = flags & CHECK_CAPS_NODELAY;
				1388
				1389	/* if we are unmounting, flush any unused caps immediately. */
				1390	if (mdsc->stopping)
				1391	is_delayed = 1;
				1392
				1393	spin_lock(&inode->i_lock);
				1394
				1395	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				1396	flags \|= CHECK_CAPS_FLUSH;
				1397
				1398	/* flush snaps first time around only */
				1399	if (!list_empty(&ci->i_cap_snaps))
				1400	__ceph_flush_snaps(ci, &session);
				1401	goto retry_locked;
				1402	retry:
				1403	spin_lock(&inode->i_lock);
				1404	retry_locked:
				1405	file_wanted = __ceph_caps_file_wanted(ci);
				1406	used = __ceph_caps_used(ci);
				1407	want = file_wanted \| used;
				1408
				1409	retain = want \| CEPH_CAP_PIN;
				1410	if (!mdsc->stopping && inode->i_nlink > 0) {
				1411	if (want) {
				1412	retain \|= CEPH_CAP_ANY; /* be greedy */
				1413	} else {
				1414	retain \|= CEPH_CAP_ANY_SHARED;
				1415	/*
				1416	* keep RD only if we didn't have the file open RW,
				1417	* because then the mds would revoke it anyway to
				1418	* journal max_size=0.
				1419	*/
				1420	if (ci->i_max_size == 0)
				1421	retain \|= CEPH_CAP_ANY_RD;
				1422	}
				1423	}
				1424
				1425	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
				1426	" issued %s retain %s %s%s%s\n", inode,
				1427	ceph_cap_string(file_wanted),
				1428	ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
				1429	ceph_cap_string(ci->i_flushing_caps),
				1430	ceph_cap_string(__ceph_caps_issued(ci, NULL)),
				1431	ceph_cap_string(retain),
				1432	(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
				1433	(flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
				1434	(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
				1435
				1436	/*
				1437	* If we no longer need to hold onto old our caps, and we may
				1438	* have cached pages, but don't want them, then try to invalidate.
				1439	* If we fail, it's because pages are locked.... try again later.
				1440	*/
				1441	if ((!is_delayed \|\| mdsc->stopping) &&
				1442	ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
				1443	ci->i_rdcache_gen && /* may have cached pages */
				1444	file_wanted == 0 && /* no open files */
				1445	!ci->i_truncate_pending &&
				1446	!tried_invalidate) {
				1447	u32 invalidating_gen = ci->i_rdcache_gen;
				1448	int ret;
				1449
				1450	dout("check_caps trying to invalidate on %p\n", inode);
				1451	spin_unlock(&inode->i_lock);
				1452	ret = invalidate_inode_pages2(&inode->i_data);
				1453	spin_lock(&inode->i_lock);
				1454	if (ret == 0 && invalidating_gen == ci->i_rdcache_gen) {
				1455	/* success. */
				1456	ci->i_rdcache_gen = 0;
				1457	ci->i_rdcache_revoking = 0;
				1458	} else {
				1459	dout("check_caps failed to invalidate pages\n");
				1460	/* we failed to invalidate pages. check these
				1461	caps again later. */
				1462	force_requeue = 1;
				1463	__cap_set_timeouts(mdsc, ci);
				1464	}
				1465	tried_invalidate = 1;
				1466	goto retry_locked;
				1467	}
				1468
				1469	num = 0;
				1470	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				1471	cap = rb_entry(p, struct ceph_cap, ci_node);
				1472	num++;
				1473
				1474	/* avoid looping forever */
				1475	if (mds >= cap->mds \|\|
				1476	((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
				1477	continue;
				1478
				1479	/* NOTE: no side-effects allowed, until we take s_mutex */
				1480
				1481	revoking = cap->implemented & ~cap->issued;
				1482	if (revoking)
				1483	dout("mds%d revoking %s\n", cap->mds,
				1484	ceph_cap_string(revoking));
				1485
				1486	if (cap == ci->i_auth_cap &&
				1487	(cap->issued & CEPH_CAP_FILE_WR)) {
				1488	/* request larger max_size from MDS? */
				1489	if (ci->i_wanted_max_size > ci->i_max_size &&
				1490	ci->i_wanted_max_size > ci->i_requested_max_size) {
				1491	dout("requesting new max_size\n");
				1492	goto ack;
				1493	}
				1494
				1495	/* approaching file_max? */
				1496	if ((inode->i_size << 1) >= ci->i_max_size &&
				1497	(ci->i_reported_size << 1) < ci->i_max_size) {
				1498	dout("i_size approaching max_size\n");
				1499	goto ack;
				1500	}
				1501	}
				1502	/* flush anything dirty? */
				1503	if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
				1504	ci->i_dirty_caps) {
				1505	dout("flushing dirty caps\n");
				1506	goto ack;
				1507	}
				1508
				1509	/* completed revocation? going down and there are no caps? */
				1510	if (revoking && (revoking & used) == 0) {
				1511	dout("completed revocation of %s\n",
				1512	ceph_cap_string(cap->implemented & ~cap->issued));
				1513	goto ack;
				1514	}
				1515
				1516	/* want more caps from mds? */
				1517	if (want & ~(cap->mds_wanted \| cap->issued))
				1518	goto ack;
				1519
				1520	/* things we might delay */
				1521	if ((cap->issued & ~retain) == 0 &&
				1522	cap->mds_wanted == want)
				1523	continue; /* nope, all good */
				1524
				1525	if (is_delayed)
				1526	goto ack;
				1527
				1528	/* delay? */
				1529	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1530	time_before(jiffies, ci->i_hold_caps_max)) {
				1531	dout(" delaying issued %s -> %s, wanted %s -> %s\n",
				1532	ceph_cap_string(cap->issued),
				1533	ceph_cap_string(cap->issued & retain),
				1534	ceph_cap_string(cap->mds_wanted),
				1535	ceph_cap_string(want));
				1536	delayed++;
				1537	continue;
				1538	}
				1539
				1540	ack:
				1541	if (session && session != cap->session) {
				1542	dout("oops, wrong session %p mutex\n", session);
				1543	mutex_unlock(&session->s_mutex);
				1544	session = NULL;
				1545	}
				1546	if (!session) {
				1547	session = cap->session;
				1548	if (mutex_trylock(&session->s_mutex) == 0) {
				1549	dout("inverting session/ino locks on %p\n",
				1550	session);
				1551	spin_unlock(&inode->i_lock);
				1552	if (took_snap_rwsem) {
				1553	up_read(&mdsc->snap_rwsem);
				1554	took_snap_rwsem = 0;
				1555	}
				1556	mutex_lock(&session->s_mutex);
				1557	goto retry;
				1558	}
				1559	}
				1560	/* take snap_rwsem after session mutex */
				1561	if (!took_snap_rwsem) {
				1562	if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
				1563	dout("inverting snap/in locks on %p\n",
				1564	inode);
				1565	spin_unlock(&inode->i_lock);
				1566	down_read(&mdsc->snap_rwsem);
				1567	took_snap_rwsem = 1;
				1568	goto retry;
				1569	}
				1570	took_snap_rwsem = 1;
				1571	}
				1572
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1573	if (cap == ci->i_auth_cap && ci->i_dirty_caps)
				1574	flushing = __mark_caps_flushing(inode, session);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1575
				1576	mds = cap->mds; /* remember mds, so we don't repeat */
				1577	sent++;
				1578
				1579	/* __send_cap drops i_lock */
				1580	delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
				1581	retain, flushing, NULL);
				1582	goto retry; /* retake i_lock and restart our cap scan. */
				1583	}
				1584
				1585	/*
				1586	* Reschedule delayed caps release if we delayed anything,
				1587	* otherwise cancel.
				1588	*/
				1589	if (delayed && is_delayed)
				1590	force_requeue = 1; /* __send_cap delayed release; requeue */
				1591	if (!delayed && !is_delayed)
				1592	__cap_delay_cancel(mdsc, ci);
				1593	else if (!is_delayed \|\| force_requeue)
				1594	__cap_delay_requeue(mdsc, ci);
				1595
				1596	spin_unlock(&inode->i_lock);
				1597
				1598	if (session && drop_session_lock)
				1599	mutex_unlock(&session->s_mutex);
				1600	if (took_snap_rwsem)
				1601	up_read(&mdsc->snap_rwsem);
				1602	}
				1603
				1604	/*
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1605	* Try to flush dirty caps back to the auth mds.
				1606	*/
				1607	static int try_flush_caps(struct inode inode, struct ceph_mds_session session,
				1608	unsigned *flush_tid)
				1609	{
				1610	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1611	struct ceph_inode_info *ci = ceph_inode(inode);
				1612	int unlock_session = session ? 0 : 1;
				1613	int flushing = 0;
				1614
				1615	retry:
				1616	spin_lock(&inode->i_lock);
				1617	if (ci->i_dirty_caps && ci->i_auth_cap) {
				1618	struct ceph_cap *cap = ci->i_auth_cap;
				1619	int used = __ceph_caps_used(ci);
				1620	int want = __ceph_caps_wanted(ci);
				1621	int delayed;
				1622
				1623	if (!session) {
				1624	spin_unlock(&inode->i_lock);
				1625	session = cap->session;
				1626	mutex_lock(&session->s_mutex);
				1627	goto retry;
				1628	}
				1629	BUG_ON(session != cap->session);
				1630	if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
				1631	goto out;
				1632
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1633	flushing = __mark_caps_flushing(inode, session);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1634
				1635	/* __send_cap drops i_lock */
				1636	delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
				1637	cap->issued \| cap->implemented, flushing,
				1638	flush_tid);
				1639	if (!delayed)
				1640	goto out_unlocked;
				1641
				1642	spin_lock(&inode->i_lock);
				1643	__cap_delay_requeue(mdsc, ci);
				1644	}
				1645	out:
				1646	spin_unlock(&inode->i_lock);
				1647	out_unlocked:
				1648	if (session && unlock_session)
				1649	mutex_unlock(&session->s_mutex);
				1650	return flushing;
				1651	}
				1652
				1653	/*
				1654	* Return true if we've flushed caps through the given flush_tid.
				1655	*/
				1656	static int caps_are_flushed(struct inode *inode, unsigned tid)
				1657	{
				1658	struct ceph_inode_info *ci = ceph_inode(inode);
				1659	int dirty, i, ret = 1;
				1660
				1661	spin_lock(&inode->i_lock);
				1662	dirty = __ceph_caps_dirty(ci);
				1663	for (i = 0; i < CEPH_CAP_BITS; i++)
				1664	if ((ci->i_flushing_caps & (1 << i)) &&
				1665	ci->i_cap_flush_tid[i] <= tid) {
				1666	/* still flushing this bit */
				1667	ret = 0;
				1668	break;
				1669	}
				1670	spin_unlock(&inode->i_lock);
				1671	return ret;
				1672	}
				1673
				1674	/*
				1675	* Wait on any unsafe replies for the given inode. First wait on the
				1676	* newest request, and make that the upper bound. Then, if there are
				1677	* more requests, keep waiting on the oldest as long as it is still older
				1678	* than the original request.
				1679	*/
				1680	static void sync_write_wait(struct inode *inode)
				1681	{
				1682	struct ceph_inode_info *ci = ceph_inode(inode);
				1683	struct list_head *head = &ci->i_unsafe_writes;
				1684	struct ceph_osd_request *req;
				1685	u64 last_tid;
				1686
				1687	spin_lock(&ci->i_unsafe_lock);
				1688	if (list_empty(head))
				1689	goto out;
				1690
				1691	/* set upper bound as _last_ entry in chain */
				1692	req = list_entry(head->prev, struct ceph_osd_request,
				1693	r_unsafe_item);
				1694	last_tid = req->r_tid;
				1695
				1696	do {
				1697	ceph_osdc_get_request(req);
				1698	spin_unlock(&ci->i_unsafe_lock);
				1699	dout("sync_write_wait on tid %llu (until %llu)\n",
				1700	req->r_tid, last_tid);
				1701	wait_for_completion(&req->r_safe_completion);
				1702	spin_lock(&ci->i_unsafe_lock);
				1703	ceph_osdc_put_request(req);
				1704
				1705	/*
				1706	* from here on look at first entry in chain, since we
				1707	* only want to wait for anything older than last_tid
				1708	*/
				1709	if (list_empty(head))
				1710	break;
				1711	req = list_entry(head->next, struct ceph_osd_request,
				1712	r_unsafe_item);
				1713	} while (req->r_tid < last_tid);
				1714	out:
				1715	spin_unlock(&ci->i_unsafe_lock);
				1716	}
				1717
				1718	int ceph_fsync(struct file file, struct dentry dentry, int datasync)
				1719	{
				1720	struct inode *inode = dentry->d_inode;
				1721	struct ceph_inode_info *ci = ceph_inode(inode);
				1722	unsigned flush_tid;
				1723	int ret;
				1724	int dirty;
				1725
				1726	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
				1727	sync_write_wait(inode);
				1728
				1729	ret = filemap_write_and_wait(inode->i_mapping);
				1730	if (ret < 0)
				1731	return ret;
				1732
				1733	dirty = try_flush_caps(inode, NULL, &flush_tid);
				1734	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
				1735
				1736	/*
				1737	* only wait on non-file metadata writeback (the mds
				1738	* can recover size and mtime, so we don't need to
				1739	* wait for that)
				1740	*/
				1741	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
				1742	dout("fsync waiting for flush_tid %u\n", flush_tid);
				1743	ret = wait_event_interruptible(ci->i_cap_wq,
				1744	caps_are_flushed(inode, flush_tid));
				1745	}
				1746
				1747	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
				1748	return ret;
				1749	}
				1750
				1751	/*
				1752	* Flush any dirty caps back to the mds. If we aren't asked to wait,
				1753	* queue inode for flush but don't do so immediately, because we can
				1754	* get by with fewer MDS messages if we wait for data writeback to
				1755	* complete first.
				1756	*/
				1757	int ceph_write_inode(struct inode *inode, int wait)
				1758	{
				1759	struct ceph_inode_info *ci = ceph_inode(inode);
				1760	unsigned flush_tid;
				1761	int err = 0;
				1762	int dirty;
				1763
				1764	dout("write_inode %p wait=%d\n", inode, wait);
				1765	if (wait) {
				1766	dirty = try_flush_caps(inode, NULL, &flush_tid);
				1767	if (dirty)
				1768	err = wait_event_interruptible(ci->i_cap_wq,
				1769	caps_are_flushed(inode, flush_tid));
				1770	} else {
				1771	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1772
				1773	spin_lock(&inode->i_lock);
				1774	if (__ceph_caps_dirty(ci))
				1775	__cap_delay_requeue_front(mdsc, ci);
				1776	spin_unlock(&inode->i_lock);
				1777	}
				1778	return err;
				1779	}
				1780
				1781	/*
				1782	* After a recovering MDS goes active, we need to resend any caps
				1783	* we were flushing.
				1784	*
				1785	* Caller holds session->s_mutex.
				1786	*/
				1787	static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
				1788	struct ceph_mds_session *session)
				1789	{
				1790	struct ceph_cap_snap *capsnap;
				1791
				1792	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
				1793	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
				1794	flushing_item) {
				1795	struct ceph_inode_info *ci = capsnap->ci;
				1796	struct inode *inode = &ci->vfs_inode;
				1797	struct ceph_cap *cap;
				1798
				1799	spin_lock(&inode->i_lock);
				1800	cap = ci->i_auth_cap;
				1801	if (cap && cap->session == session) {
				1802	dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
				1803	cap, capsnap);
				1804	__ceph_flush_snaps(ci, &session);
				1805	} else {
				1806	pr_err("%p auth cap %p not mds%d ???\n", inode,
				1807	cap, session->s_mds);
				1808	spin_unlock(&inode->i_lock);
				1809	}
				1810	}
				1811	}
				1812
				1813	void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
				1814	struct ceph_mds_session *session)
				1815	{
				1816	struct ceph_inode_info *ci;
				1817
				1818	kick_flushing_capsnaps(mdsc, session);
				1819
				1820	dout("kick_flushing_caps mds%d\n", session->s_mds);
				1821	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
				1822	struct inode *inode = &ci->vfs_inode;
				1823	struct ceph_cap *cap;
				1824	int delayed = 0;
				1825
				1826	spin_lock(&inode->i_lock);
				1827	cap = ci->i_auth_cap;
				1828	if (cap && cap->session == session) {
				1829	dout("kick_flushing_caps %p cap %p %s\n", inode,
				1830	cap, ceph_cap_string(ci->i_flushing_caps));
				1831	delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
				1832	__ceph_caps_used(ci),
				1833	__ceph_caps_wanted(ci),
				1834	cap->issued \| cap->implemented,
				1835	ci->i_flushing_caps, NULL);
				1836	if (delayed) {
				1837	spin_lock(&inode->i_lock);
				1838	__cap_delay_requeue(mdsc, ci);
				1839	spin_unlock(&inode->i_lock);
				1840	}
				1841	} else {
				1842	pr_err("%p auth cap %p not mds%d ???\n", inode,
				1843	cap, session->s_mds);
				1844	spin_unlock(&inode->i_lock);
				1845	}
				1846	}
				1847	}
				1848
				1849
				1850	/*
				1851	* Take references to capabilities we hold, so that we don't release
				1852	* them to the MDS prematurely.
				1853	*
				1854	* Protected by i_lock.
				1855	*/
				1856	static void __take_cap_refs(struct ceph_inode_info *ci, int got)
				1857	{
				1858	if (got & CEPH_CAP_PIN)
				1859	ci->i_pin_ref++;
				1860	if (got & CEPH_CAP_FILE_RD)
				1861	ci->i_rd_ref++;
				1862	if (got & CEPH_CAP_FILE_CACHE)
				1863	ci->i_rdcache_ref++;
				1864	if (got & CEPH_CAP_FILE_WR)
				1865	ci->i_wr_ref++;
				1866	if (got & CEPH_CAP_FILE_BUFFER) {
				1867	if (ci->i_wrbuffer_ref == 0)
				1868	igrab(&ci->vfs_inode);
				1869	ci->i_wrbuffer_ref++;
				1870	dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
				1871	&ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
				1872	}
				1873	}
				1874
				1875	/*
				1876	* Try to grab cap references. Specify those refs we @want, and the
				1877	* minimal set we @need. Also include the larger offset we are writing
				1878	* to (when applicable), and check against max_size here as well.
				1879	* Note that caller is responsible for ensuring max_size increases are
				1880	* requested from the MDS.
				1881	*/
				1882	static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
				1883	int got, loff_t endoff, int check_max, int *err)
				1884	{
				1885	struct inode *inode = &ci->vfs_inode;
				1886	int ret = 0;
				1887	int have, implemented;
				1888
				1889	dout("get_cap_refs %p need %s want %s\n", inode,
				1890	ceph_cap_string(need), ceph_cap_string(want));
				1891	spin_lock(&inode->i_lock);
				1892
				1893	/* make sure we _have_ some caps! */
				1894	if (!__ceph_is_any_caps(ci)) {
				1895	dout("get_cap_refs %p no real caps\n", inode);
				1896	*err = -EBADF;
				1897	ret = 1;
				1898	goto out;
				1899	}
				1900
				1901	if (need & CEPH_CAP_FILE_WR) {
				1902	if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
				1903	dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
				1904	inode, endoff, ci->i_max_size);
				1905	if (endoff > ci->i_wanted_max_size) {
				1906	*check_max = 1;
				1907	ret = 1;
				1908	}
				1909	goto out;
				1910	}
				1911	/*
				1912	* If a sync write is in progress, we must wait, so that we
				1913	* can get a final snapshot value for size+mtime.
				1914	*/
				1915	if (__ceph_have_pending_cap_snap(ci)) {
				1916	dout("get_cap_refs %p cap_snap_pending\n", inode);
				1917	goto out;
				1918	}
				1919	}
				1920	have = __ceph_caps_issued(ci, &implemented);
				1921
				1922	/*
				1923	* disallow writes while a truncate is pending
				1924	*/
				1925	if (ci->i_truncate_pending)
				1926	have &= ~CEPH_CAP_FILE_WR;
				1927
				1928	if ((have & need) == need) {
				1929	/*
				1930	* Look at (implemented & ~have & not) so that we keep waiting
				1931	* on transition from wanted -> needed caps. This is needed
				1932	* for WRBUFFER\|WR -> WR to avoid a new WR sync write from
				1933	* going before a prior buffered writeback happens.
				1934	*/
				1935	int not = want & ~(have & need);
				1936	int revoking = implemented & ~have;
				1937	dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
				1938	inode, ceph_cap_string(have), ceph_cap_string(not),
				1939	ceph_cap_string(revoking));
				1940	if ((revoking & not) == 0) {
				1941	*got = need \| (have & want);
				1942	__take_cap_refs(ci, *got);
				1943	ret = 1;
				1944	}
				1945	} else {
				1946	dout("get_cap_refs %p have %s needed %s\n", inode,
				1947	ceph_cap_string(have), ceph_cap_string(need));
				1948	}
				1949	out:
				1950	spin_unlock(&inode->i_lock);
				1951	dout("get_cap_refs %p ret %d got %s\n", inode,
				1952	ret, ceph_cap_string(*got));
				1953	return ret;
				1954	}
				1955
				1956	/*
				1957	* Check the offset we are writing up to against our current
				1958	* max_size. If necessary, tell the MDS we want to write to
				1959	* a larger offset.
				1960	*/
				1961	static void check_max_size(struct inode *inode, loff_t endoff)
				1962	{
				1963	struct ceph_inode_info *ci = ceph_inode(inode);
				1964	int check = 0;
				1965
				1966	/* do we need to explicitly request a larger max_size? */
				1967	spin_lock(&inode->i_lock);
				1968	if ((endoff >= ci->i_max_size \|\|
				1969	endoff > (inode->i_size << 1)) &&
				1970	endoff > ci->i_wanted_max_size) {
				1971	dout("write %p at large endoff %llu, req max_size\n",
				1972	inode, endoff);
				1973	ci->i_wanted_max_size = endoff;
				1974	check = 1;
				1975	}
				1976	spin_unlock(&inode->i_lock);
				1977	if (check)
				1978	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				1979	}
				1980
				1981	/*
				1982	* Wait for caps, and take cap references. If we can't get a WR cap
				1983	* due to a small max_size, make sure we check_max_size (and possibly
				1984	* ask the mds) so we don't get hung up indefinitely.
				1985	*/
				1986	int ceph_get_caps(struct ceph_inode_info ci, int need, int want, int got,
				1987	loff_t endoff)
				1988	{
				1989	int check_max, ret, err;
				1990
				1991	retry:
				1992	if (endoff > 0)
				1993	check_max_size(&ci->vfs_inode, endoff);
				1994	check_max = 0;
				1995	err = 0;
				1996	ret = wait_event_interruptible(ci->i_cap_wq,
				1997	try_get_cap_refs(ci, need, want,
				1998	got, endoff,
				1999	&check_max, &err));
				2000	if (err)
				2001	ret = err;
				2002	if (check_max)
				2003	goto retry;
				2004	return ret;
				2005	}
				2006
				2007	/*
				2008	* Take cap refs. Caller must already know we hold at least one ref
				2009	* on the caps in question or we don't know this is safe.
				2010	*/
				2011	void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
				2012	{
				2013	spin_lock(&ci->vfs_inode.i_lock);
				2014	__take_cap_refs(ci, caps);
				2015	spin_unlock(&ci->vfs_inode.i_lock);
				2016	}
				2017
				2018	/*
				2019	* Release cap refs.
				2020	*
				2021	* If we released the last ref on any given cap, call ceph_check_caps
				2022	* to release (or schedule a release).
				2023	*
				2024	* If we are releasing a WR cap (from a sync write), finalize any affected
				2025	* cap_snap, and wake up any waiters.
				2026	*/
				2027	void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
				2028	{
				2029	struct inode *inode = &ci->vfs_inode;
				2030	int last = 0, put = 0, flushsnaps = 0, wake = 0;
				2031	struct ceph_cap_snap *capsnap;
				2032
				2033	spin_lock(&inode->i_lock);
				2034	if (had & CEPH_CAP_PIN)
				2035	--ci->i_pin_ref;
				2036	if (had & CEPH_CAP_FILE_RD)
				2037	if (--ci->i_rd_ref == 0)
				2038	last++;
				2039	if (had & CEPH_CAP_FILE_CACHE)
				2040	if (--ci->i_rdcache_ref == 0)
				2041	last++;
				2042	if (had & CEPH_CAP_FILE_BUFFER) {
				2043	if (--ci->i_wrbuffer_ref == 0) {
				2044	last++;
				2045	put++;
				2046	}
				2047	dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
				2048	inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
				2049	}
				2050	if (had & CEPH_CAP_FILE_WR)
				2051	if (--ci->i_wr_ref == 0) {
				2052	last++;
				2053	if (!list_empty(&ci->i_cap_snaps)) {
				2054	capsnap = list_first_entry(&ci->i_cap_snaps,
				2055	struct ceph_cap_snap,
				2056	ci_item);
				2057	if (capsnap->writing) {
				2058	capsnap->writing = 0;
				2059	flushsnaps =
				2060	__ceph_finish_cap_snap(ci,
				2061	capsnap);
				2062	wake = 1;
				2063	}
				2064	}
				2065	}
				2066	spin_unlock(&inode->i_lock);
				2067
				2068	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
				2069	last ? "last" : "");
				2070
				2071	if (last && !flushsnaps)
				2072	ceph_check_caps(ci, 0, NULL);
				2073	else if (flushsnaps)
				2074	ceph_flush_snaps(ci);
				2075	if (wake)
				2076	wake_up(&ci->i_cap_wq);
				2077	if (put)
				2078	iput(inode);
				2079	}
				2080
				2081	/*
				2082	* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
				2083	* context. Adjust per-snap dirty page accounting as appropriate.
				2084	* Once all dirty data for a cap_snap is flushed, flush snapped file
				2085	* metadata back to the MDS. If we dropped the last ref, call
				2086	* ceph_check_caps.
				2087	*/
				2088	void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				2089	struct ceph_snap_context *snapc)
				2090	{
				2091	struct inode *inode = &ci->vfs_inode;
				2092	int last = 0;
				2093	int last_snap = 0;
				2094	int found = 0;
				2095	struct ceph_cap_snap *capsnap = NULL;
				2096
				2097	spin_lock(&inode->i_lock);
				2098	ci->i_wrbuffer_ref -= nr;
				2099	last = !ci->i_wrbuffer_ref;
				2100
				2101	if (ci->i_head_snapc == snapc) {
				2102	ci->i_wrbuffer_ref_head -= nr;
				2103	if (!ci->i_wrbuffer_ref_head) {
				2104	ceph_put_snap_context(ci->i_head_snapc);
				2105	ci->i_head_snapc = NULL;
				2106	}
				2107	dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
				2108	inode,
				2109	ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
				2110	ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
				2111	last ? " LAST" : "");
				2112	} else {
				2113	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				2114	if (capsnap->context == snapc) {
				2115	found = 1;
				2116	capsnap->dirty_pages -= nr;
				2117	last_snap = !capsnap->dirty_pages;
				2118	break;
				2119	}
				2120	}
				2121	BUG_ON(!found);
				2122	dout("put_wrbuffer_cap_refs on %p cap_snap %p "
				2123	" snap %lld %d/%d -> %d/%d %s%s\n",
				2124	inode, capsnap, capsnap->context->seq,
				2125	ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
				2126	ci->i_wrbuffer_ref, capsnap->dirty_pages,
				2127	last ? " (wrbuffer last)" : "",
				2128	last_snap ? " (capsnap last)" : "");
				2129	}
				2130
				2131	spin_unlock(&inode->i_lock);
				2132
				2133	if (last) {
				2134	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				2135	iput(inode);
				2136	} else if (last_snap) {
				2137	ceph_flush_snaps(ci);
				2138	wake_up(&ci->i_cap_wq);
				2139	}
				2140	}
				2141
				2142	/*
				2143	* Handle a cap GRANT message from the MDS. (Note that a GRANT may
				2144	* actually be a revocation if it specifies a smaller cap set.)
				2145	*
				2146	* caller holds s_mutex.
				2147	* return value:
				2148	* 0 - ok
				2149	* 1 - check_caps on auth cap only (writeback)
				2150	* 2 - check_caps (ack revoke)
				2151	*/
				2152	static int handle_cap_grant(struct inode inode, struct ceph_mds_caps grant,
				2153	struct ceph_mds_session *session,
				2154	struct ceph_cap *cap,
				2155	struct ceph_buffer *xattr_buf)
				2156	__releases(inode->i_lock)
				2157
				2158	{
				2159	struct ceph_inode_info *ci = ceph_inode(inode);
				2160	int mds = session->s_mds;
				2161	int seq = le32_to_cpu(grant->seq);
				2162	int newcaps = le32_to_cpu(grant->caps);
				2163	int issued, implemented, used, wanted, dirty;
				2164	u64 size = le64_to_cpu(grant->size);
				2165	u64 max_size = le64_to_cpu(grant->max_size);
				2166	struct timespec mtime, atime, ctime;
				2167	int reply = 0;
				2168	int wake = 0;
				2169	int writeback = 0;
				2170	int revoked_rdcache = 0;
				2171	int invalidate_async = 0;
				2172	int tried_invalidate = 0;
				2173	int ret;
				2174
				2175	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
				2176	inode, cap, mds, seq, ceph_cap_string(newcaps));
				2177	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
				2178	inode->i_size);
				2179
				2180	/*
				2181	* If CACHE is being revoked, and we have no dirty buffers,
				2182	* try to invalidate (once). (If there are dirty buffers, we
				2183	* will invalidate _after_ writeback.)
				2184	*/
				2185	restart:
				2186	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
				2187	!ci->i_wrbuffer_ref && !tried_invalidate) {
				2188	dout("CACHE invalidation\n");
				2189	spin_unlock(&inode->i_lock);
				2190	tried_invalidate = 1;
				2191
				2192	ret = invalidate_inode_pages2(&inode->i_data);
				2193	spin_lock(&inode->i_lock);
				2194	if (ret < 0) {
				2195	/* there were locked pages.. invalidate later
				2196	in a separate thread. */
				2197	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				2198	invalidate_async = 1;
				2199	ci->i_rdcache_revoking = ci->i_rdcache_gen;
				2200	}
				2201	} else {
				2202	/* we successfully invalidated those pages */
				2203	revoked_rdcache = 1;
				2204	ci->i_rdcache_gen = 0;
				2205	ci->i_rdcache_revoking = 0;
				2206	}
				2207	goto restart;
				2208	}
				2209
				2210	/* side effects now are allowed */
				2211
				2212	issued = __ceph_caps_issued(ci, &implemented);
				2213	issued \|= implemented \| __ceph_caps_dirty(ci);
				2214
Sage Weil	685f9a5d	2009-11-09 12:05:48 -0800	[diff] [blame^]	2215	cap->cap_gen = session->s_cap_gen;
				2216	cap->recon_gen = session->s_recon_gen;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2217
				2218	__check_cap_issue(ci, cap, newcaps);
				2219
				2220	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
				2221	inode->i_mode = le32_to_cpu(grant->mode);
				2222	inode->i_uid = le32_to_cpu(grant->uid);
				2223	inode->i_gid = le32_to_cpu(grant->gid);
				2224	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
				2225	inode->i_uid, inode->i_gid);
				2226	}
				2227
				2228	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
				2229	inode->i_nlink = le32_to_cpu(grant->nlink);
				2230
				2231	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
				2232	int len = le32_to_cpu(grant->xattr_len);
				2233	u64 version = le64_to_cpu(grant->xattr_version);
				2234
				2235	if (version > ci->i_xattrs.version) {
				2236	dout(" got new xattrs v%llu on %p len %d\n",
				2237	version, inode, len);
				2238	if (ci->i_xattrs.blob)
				2239	ceph_buffer_put(ci->i_xattrs.blob);
				2240	ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
				2241	ci->i_xattrs.version = version;
				2242	}
				2243	}
				2244
				2245	/* size/ctime/mtime/atime? */
				2246	ceph_fill_file_size(inode, issued,
				2247	le32_to_cpu(grant->truncate_seq),
				2248	le64_to_cpu(grant->truncate_size), size);
				2249	ceph_decode_timespec(&mtime, &grant->mtime);
				2250	ceph_decode_timespec(&atime, &grant->atime);
				2251	ceph_decode_timespec(&ctime, &grant->ctime);
				2252	ceph_fill_file_time(inode, issued,
				2253	le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
				2254	&atime);
				2255
				2256	/* max size increase? */
				2257	if (max_size != ci->i_max_size) {
				2258	dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
				2259	ci->i_max_size = max_size;
				2260	if (max_size >= ci->i_wanted_max_size) {
				2261	ci->i_wanted_max_size = 0; /* reset */
				2262	ci->i_requested_max_size = 0;
				2263	}
				2264	wake = 1;
				2265	}
				2266
				2267	/* check cap bits */
				2268	wanted = __ceph_caps_wanted(ci);
				2269	used = __ceph_caps_used(ci);
				2270	dirty = __ceph_caps_dirty(ci);
				2271	dout(" my wanted = %s, used = %s, dirty %s\n",
				2272	ceph_cap_string(wanted),
				2273	ceph_cap_string(used),
				2274	ceph_cap_string(dirty));
				2275	if (wanted != le32_to_cpu(grant->wanted)) {
				2276	dout("mds wanted %s -> %s\n",
				2277	ceph_cap_string(le32_to_cpu(grant->wanted)),
				2278	ceph_cap_string(wanted));
				2279	grant->wanted = cpu_to_le32(wanted);
				2280	}
				2281
				2282	cap->seq = seq;
				2283
				2284	/* file layout may have changed */
				2285	ci->i_layout = grant->layout;
				2286
				2287	/* revocation, grant, or no-op? */
				2288	if (cap->issued & ~newcaps) {
				2289	dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
				2290	ceph_cap_string(newcaps));
				2291	if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
				2292	writeback = 1; /* will delay ack */
				2293	else if (dirty & ~newcaps)
				2294	reply = 1; /* initiate writeback in check_caps */
				2295	else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 \|\|
				2296	revoked_rdcache)
				2297	reply = 2; /* send revoke ack in check_caps */
				2298	cap->issued = newcaps;
				2299	} else if (cap->issued == newcaps) {
				2300	dout("caps unchanged: %s -> %s\n",
				2301	ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
				2302	} else {
				2303	dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
				2304	ceph_cap_string(newcaps));
				2305	cap->issued = newcaps;
				2306	cap->implemented \|= newcaps; /* add bits only, to
				2307	* avoid stepping on a
				2308	* pending revocation */
				2309	wake = 1;
				2310	}
				2311
				2312	spin_unlock(&inode->i_lock);
				2313	if (writeback) {
				2314	/*
				2315	* queue inode for writeback: we can't actually call
				2316	* filemap_write_and_wait, etc. from message handler
				2317	* context.
				2318	*/
				2319	dout("queueing %p for writeback\n", inode);
				2320	if (ceph_queue_writeback(inode))
				2321	igrab(inode);
				2322	}
				2323	if (invalidate_async) {
				2324	dout("queueing %p for page invalidation\n", inode);
				2325	if (ceph_queue_page_invalidation(inode))
				2326	igrab(inode);
				2327	}
				2328	if (wake)
				2329	wake_up(&ci->i_cap_wq);
				2330	return reply;
				2331	}
				2332
				2333	/*
				2334	* Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
				2335	* MDS has been safely committed.
				2336	*/
				2337	static void handle_cap_flush_ack(struct inode *inode,
				2338	struct ceph_mds_caps *m,
				2339	struct ceph_mds_session *session,
				2340	struct ceph_cap *cap)
				2341	__releases(inode->i_lock)
				2342	{
				2343	struct ceph_inode_info *ci = ceph_inode(inode);
				2344	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				2345	unsigned seq = le32_to_cpu(m->seq);
				2346	int dirty = le32_to_cpu(m->dirty);
				2347	int cleaned = 0;
				2348	u64 flush_tid = le64_to_cpu(m->client_tid);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2349	int drop = 0;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2350	int i;
				2351
				2352	for (i = 0; i < CEPH_CAP_BITS; i++)
				2353	if ((dirty & (1 << i)) &&
				2354	flush_tid == ci->i_cap_flush_tid[i])
				2355	cleaned \|= 1 << i;
				2356
				2357	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
				2358	" flushing %s -> %s\n",
				2359	inode, session->s_mds, seq, ceph_cap_string(dirty),
				2360	ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
				2361	ceph_cap_string(ci->i_flushing_caps & ~cleaned));
				2362
				2363	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
				2364	goto out;
				2365
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2366	ci->i_flushing_caps &= ~cleaned;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2367
				2368	spin_lock(&mdsc->cap_dirty_lock);
				2369	if (ci->i_flushing_caps == 0) {
				2370	list_del_init(&ci->i_flushing_item);
				2371	if (!list_empty(&session->s_cap_flushing))
				2372	dout(" mds%d still flushing cap on %p\n",
				2373	session->s_mds,
				2374	&list_entry(session->s_cap_flushing.next,
				2375	struct ceph_inode_info,
				2376	i_flushing_item)->vfs_inode);
				2377	mdsc->num_cap_flushing--;
				2378	wake_up(&mdsc->cap_flushing_wq);
				2379	dout(" inode %p now !flushing\n", inode);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2380
				2381	if (ci->i_dirty_caps == 0) {
				2382	dout(" inode %p now clean\n", inode);
				2383	BUG_ON(!list_empty(&ci->i_dirty_item));
				2384	drop = 1;
Sage Weil	76e3b39	2009-10-15 18:13:53 -0700	[diff] [blame]	2385	} else {
				2386	BUG_ON(list_empty(&ci->i_dirty_item));
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2387	}
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2388	}
				2389	spin_unlock(&mdsc->cap_dirty_lock);
				2390	wake_up(&ci->i_cap_wq);
				2391
				2392	out:
				2393	spin_unlock(&inode->i_lock);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2394	if (drop)
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2395	iput(inode);
				2396	}
				2397
				2398	/*
				2399	* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
				2400	* throw away our cap_snap.
				2401	*
				2402	* Caller hold s_mutex.
				2403	*/
				2404	static void handle_cap_flushsnap_ack(struct inode *inode,
				2405	struct ceph_mds_caps *m,
				2406	struct ceph_mds_session *session)
				2407	{
				2408	struct ceph_inode_info *ci = ceph_inode(inode);
				2409	u64 follows = le64_to_cpu(m->snap_follows);
				2410	u64 flush_tid = le64_to_cpu(m->client_tid);
				2411	struct ceph_cap_snap *capsnap;
				2412	int drop = 0;
				2413
				2414	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
				2415	inode, ci, session->s_mds, follows);
				2416
				2417	spin_lock(&inode->i_lock);
				2418	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				2419	if (capsnap->follows == follows) {
				2420	if (capsnap->flush_tid != flush_tid) {
				2421	dout(" cap_snap %p follows %lld tid %lld !="
				2422	" %lld\n", capsnap, follows,
				2423	flush_tid, capsnap->flush_tid);
				2424	break;
				2425	}
				2426	WARN_ON(capsnap->dirty_pages \|\| capsnap->writing);
				2427	dout(" removing cap_snap %p follows %lld\n",
				2428	capsnap, follows);
				2429	ceph_put_snap_context(capsnap->context);
				2430	list_del(&capsnap->ci_item);
				2431	list_del(&capsnap->flushing_item);
				2432	ceph_put_cap_snap(capsnap);
				2433	drop = 1;
				2434	break;
				2435	} else {
				2436	dout(" skipping cap_snap %p follows %lld\n",
				2437	capsnap, capsnap->follows);
				2438	}
				2439	}
				2440	spin_unlock(&inode->i_lock);
				2441	if (drop)
				2442	iput(inode);
				2443	}
				2444
				2445	/*
				2446	* Handle TRUNC from MDS, indicating file truncation.
				2447	*
				2448	* caller hold s_mutex.
				2449	*/
				2450	static void handle_cap_trunc(struct inode *inode,
				2451	struct ceph_mds_caps *trunc,
				2452	struct ceph_mds_session *session)
				2453	__releases(inode->i_lock)
				2454	{
				2455	struct ceph_inode_info *ci = ceph_inode(inode);
				2456	int mds = session->s_mds;
				2457	int seq = le32_to_cpu(trunc->seq);
				2458	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
				2459	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
				2460	u64 size = le64_to_cpu(trunc->size);
				2461	int implemented = 0;
				2462	int dirty = __ceph_caps_dirty(ci);
				2463	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
				2464	int queue_trunc = 0;
				2465
				2466	issued \|= implemented \| dirty;
				2467
				2468	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
				2469	inode, mds, seq, truncate_size, truncate_seq);
				2470	queue_trunc = ceph_fill_file_size(inode, issued,
				2471	truncate_seq, truncate_size, size);
				2472	spin_unlock(&inode->i_lock);
				2473
				2474	if (queue_trunc)
				2475	if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
				2476	&ci->i_vmtruncate_work))
				2477	igrab(inode);
				2478	}
				2479
				2480	/*
				2481	* Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
				2482	* different one. If we are the most recent migration we've seen (as
				2483	* indicated by mseq), make note of the migrating cap bits for the
				2484	* duration (until we see the corresponding IMPORT).
				2485	*
				2486	* caller holds s_mutex
				2487	*/
				2488	static void handle_cap_export(struct inode inode, struct ceph_mds_caps ex,
				2489	struct ceph_mds_session *session)
				2490	{
				2491	struct ceph_inode_info *ci = ceph_inode(inode);
				2492	int mds = session->s_mds;
				2493	unsigned mseq = le32_to_cpu(ex->migrate_seq);
				2494	struct ceph_cap cap = NULL, t;
				2495	struct rb_node *p;
				2496	int remember = 1;
				2497
				2498	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
				2499	inode, ci, mds, mseq);
				2500
				2501	spin_lock(&inode->i_lock);
				2502
				2503	/* make sure we haven't seen a higher mseq */
				2504	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				2505	t = rb_entry(p, struct ceph_cap, ci_node);
				2506	if (ceph_seq_cmp(t->mseq, mseq) > 0) {
				2507	dout(" higher mseq on cap from mds%d\n",
				2508	t->session->s_mds);
				2509	remember = 0;
				2510	}
				2511	if (t->session->s_mds == mds)
				2512	cap = t;
				2513	}
				2514
				2515	if (cap) {
				2516	if (remember) {
				2517	/* make note */
				2518	ci->i_cap_exporting_mds = mds;
				2519	ci->i_cap_exporting_mseq = mseq;
				2520	ci->i_cap_exporting_issued = cap->issued;
				2521	}
				2522	__ceph_remove_cap(cap, NULL);
				2523	} else {
				2524	WARN_ON(!cap);
				2525	}
				2526
				2527	spin_unlock(&inode->i_lock);
				2528	}
				2529
				2530	/*
				2531	* Handle cap IMPORT. If there are temp bits from an older EXPORT,
				2532	* clean them up.
				2533	*
				2534	* caller holds s_mutex.
				2535	*/
				2536	static void handle_cap_import(struct ceph_mds_client *mdsc,
				2537	struct inode inode, struct ceph_mds_caps im,
				2538	struct ceph_mds_session *session,
				2539	void *snaptrace, int snaptrace_len)
				2540	{
				2541	struct ceph_inode_info *ci = ceph_inode(inode);
				2542	int mds = session->s_mds;
				2543	unsigned issued = le32_to_cpu(im->caps);
				2544	unsigned wanted = le32_to_cpu(im->wanted);
				2545	unsigned seq = le32_to_cpu(im->seq);
				2546	unsigned mseq = le32_to_cpu(im->migrate_seq);
				2547	u64 realmino = le64_to_cpu(im->realm);
				2548	u64 cap_id = le64_to_cpu(im->cap_id);
				2549
				2550	if (ci->i_cap_exporting_mds >= 0 &&
				2551	ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
				2552	dout("handle_cap_import inode %p ci %p mds%d mseq %d"
				2553	" - cleared exporting from mds%d\n",
				2554	inode, ci, mds, mseq,
				2555	ci->i_cap_exporting_mds);
				2556	ci->i_cap_exporting_issued = 0;
				2557	ci->i_cap_exporting_mseq = 0;
				2558	ci->i_cap_exporting_mds = -1;
				2559	} else {
				2560	dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
				2561	inode, ci, mds, mseq);
				2562	}
				2563
				2564	down_write(&mdsc->snap_rwsem);
				2565	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
				2566	false);
				2567	downgrade_write(&mdsc->snap_rwsem);
				2568	ceph_add_cap(inode, session, cap_id, -1,
				2569	issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
				2570	NULL /* no caps context */);
				2571	try_flush_caps(inode, session, NULL);
				2572	up_read(&mdsc->snap_rwsem);
				2573	}
				2574
				2575	/*
				2576	* Handle a caps message from the MDS.
				2577	*
				2578	* Identify the appropriate session, inode, and call the right handler
				2579	* based on the cap op.
				2580	*/
				2581	void ceph_handle_caps(struct ceph_mds_session *session,
				2582	struct ceph_msg *msg)
				2583	{
				2584	struct ceph_mds_client *mdsc = session->s_mdsc;
				2585	struct super_block *sb = mdsc->client->sb;
				2586	struct inode *inode;
				2587	struct ceph_cap *cap;
				2588	struct ceph_mds_caps *h;
				2589	int mds = le64_to_cpu(msg->hdr.src.name.num);
				2590	int op;
				2591	u32 seq;
				2592	struct ceph_vino vino;
				2593	u64 cap_id;
				2594	u64 size, max_size;
				2595	int check_caps = 0;
				2596	int r;
				2597
				2598	dout("handle_caps from mds%d\n", mds);
				2599
				2600	/* decode */
				2601	if (msg->front.iov_len < sizeof(*h))
				2602	goto bad;
				2603	h = msg->front.iov_base;
				2604	op = le32_to_cpu(h->op);
				2605	vino.ino = le64_to_cpu(h->ino);
				2606	vino.snap = CEPH_NOSNAP;
				2607	cap_id = le64_to_cpu(h->cap_id);
				2608	seq = le32_to_cpu(h->seq);
				2609	size = le64_to_cpu(h->size);
				2610	max_size = le64_to_cpu(h->max_size);
				2611
				2612	mutex_lock(&session->s_mutex);
				2613	session->s_seq++;
				2614	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
				2615	(unsigned)seq);
				2616
				2617	/* lookup ino */
				2618	inode = ceph_find_inode(sb, vino);
				2619	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
				2620	vino.snap, inode);
				2621	if (!inode) {
				2622	dout(" i don't have ino %llx\n", vino.ino);
				2623	goto done;
				2624	}
				2625
				2626	/* these will work even if we don't have a cap yet */
				2627	switch (op) {
				2628	case CEPH_CAP_OP_FLUSHSNAP_ACK:
				2629	handle_cap_flushsnap_ack(inode, h, session);
				2630	goto done;
				2631
				2632	case CEPH_CAP_OP_EXPORT:
				2633	handle_cap_export(inode, h, session);
				2634	goto done;
				2635
				2636	case CEPH_CAP_OP_IMPORT:
				2637	handle_cap_import(mdsc, inode, h, session,
				2638	msg->middle,
				2639	le32_to_cpu(h->snap_trace_len));
				2640	check_caps = 1; /* we may have sent a RELEASE to the old auth */
				2641	goto done;
				2642	}
				2643
				2644	/* the rest require a cap */
				2645	spin_lock(&inode->i_lock);
				2646	cap = __get_cap_for_mds(ceph_inode(inode), mds);
				2647	if (!cap) {
				2648	dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
				2649	inode, ceph_ino(inode), ceph_snap(inode), mds);
				2650	spin_unlock(&inode->i_lock);
				2651	goto done;
				2652	}
				2653
				2654	/* note that each of these drops i_lock for us */
				2655	switch (op) {
				2656	case CEPH_CAP_OP_REVOKE:
				2657	case CEPH_CAP_OP_GRANT:
				2658	r = handle_cap_grant(inode, h, session, cap, msg->middle);
				2659	if (r == 1)
				2660	ceph_check_caps(ceph_inode(inode),
				2661	CHECK_CAPS_NODELAY\|CHECK_CAPS_AUTHONLY,
				2662	session);
				2663	else if (r == 2)
				2664	ceph_check_caps(ceph_inode(inode),
				2665	CHECK_CAPS_NODELAY,
				2666	session);
				2667	break;
				2668
				2669	case CEPH_CAP_OP_FLUSH_ACK:
				2670	handle_cap_flush_ack(inode, h, session, cap);
				2671	break;
				2672
				2673	case CEPH_CAP_OP_TRUNC:
				2674	handle_cap_trunc(inode, h, session);
				2675	break;
				2676
				2677	default:
				2678	spin_unlock(&inode->i_lock);
				2679	pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
				2680	ceph_cap_op_name(op));
				2681	}
				2682
				2683	done:
				2684	mutex_unlock(&session->s_mutex);
				2685
				2686	if (check_caps)
				2687	ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
				2688	if (inode)
				2689	iput(inode);
				2690	return;
				2691
				2692	bad:
				2693	pr_err("ceph_handle_caps: corrupt message\n");
				2694	return;
				2695	}
				2696
				2697	/*
				2698	* Delayed work handler to process end of delayed cap release LRU list.
				2699	*/
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2700	void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2701	{
				2702	struct ceph_inode_info *ci;
				2703	int flags = CHECK_CAPS_NODELAY;
				2704
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2705	dout("check_delayed_caps\n");
				2706	while (1) {
				2707	spin_lock(&mdsc->cap_delay_lock);
				2708	if (list_empty(&mdsc->cap_delay_list))
				2709	break;
				2710	ci = list_first_entry(&mdsc->cap_delay_list,
				2711	struct ceph_inode_info,
				2712	i_cap_delay_list);
				2713	if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
				2714	time_before(jiffies, ci->i_hold_caps_max))
				2715	break;
				2716	list_del_init(&ci->i_cap_delay_list);
				2717	spin_unlock(&mdsc->cap_delay_lock);
				2718	dout("check_delayed_caps on %p\n", &ci->vfs_inode);
				2719	ceph_check_caps(ci, flags, NULL);
				2720	}
				2721	spin_unlock(&mdsc->cap_delay_lock);
				2722	}
				2723
				2724	/*
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2725	* Flush all dirty caps to the mds
				2726	*/
				2727	void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
				2728	{
				2729	struct ceph_inode_info *ci;
				2730	struct inode *inode;
				2731
				2732	dout("flush_dirty_caps\n");
				2733	spin_lock(&mdsc->cap_dirty_lock);
				2734	while (!list_empty(&mdsc->cap_dirty)) {
				2735	ci = list_first_entry(&mdsc->cap_dirty,
				2736	struct ceph_inode_info,
				2737	i_dirty_item);
				2738	inode = igrab(&ci->vfs_inode);
				2739	spin_unlock(&mdsc->cap_dirty_lock);
				2740	if (inode) {
				2741	ceph_check_caps(ci, CHECK_CAPS_NODELAY\|CHECK_CAPS_FLUSH,
				2742	NULL);
				2743	iput(inode);
				2744	}
				2745	spin_lock(&mdsc->cap_dirty_lock);
				2746	}
				2747	spin_unlock(&mdsc->cap_dirty_lock);
				2748	}
				2749
				2750	/*
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2751	* Drop open file reference. If we were the last open file,
				2752	* we may need to release capabilities to the MDS (or schedule
				2753	* their delayed release).
				2754	*/
				2755	void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
				2756	{
				2757	struct inode *inode = &ci->vfs_inode;
				2758	int last = 0;
				2759
				2760	spin_lock(&inode->i_lock);
				2761	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
				2762	ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
				2763	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
				2764	if (--ci->i_nr_by_mode[fmode] == 0)
				2765	last++;
				2766	spin_unlock(&inode->i_lock);
				2767
				2768	if (last && ci->i_vino.snap == CEPH_NOSNAP)
				2769	ceph_check_caps(ci, 0, NULL);
				2770	}
				2771
				2772	/*
				2773	* Helpers for embedding cap and dentry lease releases into mds
				2774	* requests.
				2775	*
				2776	* @force is used by dentry_release (below) to force inclusion of a
				2777	* record for the directory inode, even when there aren't any caps to
				2778	* drop.
				2779	*/
				2780	int ceph_encode_inode_release(void *p, struct inode inode,
				2781	int mds, int drop, int unless, int force)
				2782	{
				2783	struct ceph_inode_info *ci = ceph_inode(inode);
				2784	struct ceph_cap *cap;
				2785	struct ceph_mds_request_release rel = p;
				2786	int ret = 0;
				2787
				2788	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
				2789	mds, ceph_cap_string(drop), ceph_cap_string(unless));
				2790
				2791	spin_lock(&inode->i_lock);
				2792	cap = __get_cap_for_mds(ci, mds);
				2793	if (cap && __cap_is_valid(cap)) {
				2794	if (force \|\|
				2795	((cap->issued & drop) &&
				2796	(cap->issued & unless) == 0)) {
				2797	if ((cap->issued & drop) &&
				2798	(cap->issued & unless) == 0) {
				2799	dout("encode_inode_release %p cap %p %s -> "
				2800	"%s\n", inode, cap,
				2801	ceph_cap_string(cap->issued),
				2802	ceph_cap_string(cap->issued & ~drop));
				2803	cap->issued &= ~drop;
				2804	cap->implemented &= ~drop;
				2805	if (ci->i_ceph_flags & CEPH_I_NODELAY) {
				2806	int wanted = __ceph_caps_wanted(ci);
				2807	dout(" wanted %s -> %s (act %s)\n",
				2808	ceph_cap_string(cap->mds_wanted),
				2809	ceph_cap_string(cap->mds_wanted &
				2810	~wanted),
				2811	ceph_cap_string(wanted));
				2812	cap->mds_wanted &= wanted;
				2813	}
				2814	} else {
				2815	dout("encode_inode_release %p cap %p %s"
				2816	" (force)\n", inode, cap,
				2817	ceph_cap_string(cap->issued));
				2818	}
				2819
				2820	rel->ino = cpu_to_le64(ceph_ino(inode));
				2821	rel->cap_id = cpu_to_le64(cap->cap_id);
				2822	rel->seq = cpu_to_le32(cap->seq);
				2823	rel->issue_seq = cpu_to_le32(cap->issue_seq),
				2824	rel->mseq = cpu_to_le32(cap->mseq);
				2825	rel->caps = cpu_to_le32(cap->issued);
				2826	rel->wanted = cpu_to_le32(cap->mds_wanted);
				2827	rel->dname_len = 0;
				2828	rel->dname_seq = 0;
				2829	p += sizeof(rel);
				2830	ret = 1;
				2831	} else {
				2832	dout("encode_inode_release %p cap %p %s\n",
				2833	inode, cap, ceph_cap_string(cap->issued));
				2834	}
				2835	}
				2836	spin_unlock(&inode->i_lock);
				2837	return ret;
				2838	}
				2839
				2840	int ceph_encode_dentry_release(void *p, struct dentry dentry,
				2841	int mds, int drop, int unless)
				2842	{
				2843	struct inode *dir = dentry->d_parent->d_inode;
				2844	struct ceph_mds_request_release rel = p;
				2845	struct ceph_dentry_info *di = ceph_dentry(dentry);
				2846	int force = 0;
				2847	int ret;
				2848
				2849	/*
				2850	* force an record for the directory caps if we have a dentry lease.
				2851	* this is racy (can't take i_lock and d_lock together), but it
				2852	* doesn't have to be perfect; the mds will revoke anything we don't
				2853	* release.
				2854	*/
				2855	spin_lock(&dentry->d_lock);
				2856	if (di->lease_session && di->lease_session->s_mds == mds)
				2857	force = 1;
				2858	spin_unlock(&dentry->d_lock);
				2859
				2860	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
				2861
				2862	spin_lock(&dentry->d_lock);
				2863	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
				2864	dout("encode_dentry_release %p mds%d seq %d\n",
				2865	dentry, mds, (int)di->lease_seq);
				2866	rel->dname_len = cpu_to_le32(dentry->d_name.len);
				2867	memcpy(*p, dentry->d_name.name, dentry->d_name.len);
				2868	*p += dentry->d_name.len;
				2869	rel->dname_seq = cpu_to_le32(di->lease_seq);
				2870	}
				2871	spin_unlock(&dentry->d_lock);
				2872	return ret;
				2873	}