Blame - fs/ceph/caps.c - kernel/msm

blob: 8b863dbec70ce64d96131eb5e17d76f01486d086 [file] [log] [blame]

Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1	#include "ceph_debug.h"
				2
				3	#include <linux/fs.h>
				4	#include <linux/kernel.h>
				5	#include <linux/sched.h>
				6	#include <linux/vmalloc.h>
				7	#include <linux/wait.h>
				8
				9	#include "super.h"
				10	#include "decode.h"
				11	#include "messenger.h"
				12
				13	/*
				14	* Capability management
				15	*
				16	* The Ceph metadata servers control client access to inode metadata
				17	* and file data by issuing capabilities, granting clients permission
				18	* to read and/or write both inode fields and file data to OSDs
				19	* (storage nodes). Each capability consists of a set of bits
				20	* indicating which operations are allowed.
				21	*
				22	* If the client holds a *_SHARED cap, the client has a coherent value
				23	* that can be safely read from the cached inode.
				24	*
				25	* In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
				26	* client is allowed to change inode attributes (e.g., file size,
				27	* mtime), note its dirty state in the ceph_cap, and asynchronously
				28	* flush that metadata change to the MDS.
				29	*
				30	* In the event of a conflicting operation (perhaps by another
				31	* client), the MDS will revoke the conflicting client capabilities.
				32	*
				33	* In order for a client to cache an inode, it must hold a capability
				34	* with at least one MDS server. When inodes are released, release
				35	* notifications are batched and periodically sent en masse to the MDS
				36	* cluster to release server state.
				37	*/
				38
				39
				40	/*
				41	* Generate readable cap strings for debugging output.
				42	*/
				43	#define MAX_CAP_STR 20
				44	static char cap_str[MAX_CAP_STR][40];
				45	static DEFINE_SPINLOCK(cap_str_lock);
				46	static int last_cap_str;
				47
				48	static char gcap_string(char s, int c)
				49	{
				50	if (c & CEPH_CAP_GSHARED)
				51	*s++ = 's';
				52	if (c & CEPH_CAP_GEXCL)
				53	*s++ = 'x';
				54	if (c & CEPH_CAP_GCACHE)
				55	*s++ = 'c';
				56	if (c & CEPH_CAP_GRD)
				57	*s++ = 'r';
				58	if (c & CEPH_CAP_GWR)
				59	*s++ = 'w';
				60	if (c & CEPH_CAP_GBUFFER)
				61	*s++ = 'b';
				62	if (c & CEPH_CAP_GLAZYIO)
				63	*s++ = 'l';
				64	return s;
				65	}
				66
				67	const char *ceph_cap_string(int caps)
				68	{
				69	int i;
				70	char *s;
				71	int c;
				72
				73	spin_lock(&cap_str_lock);
				74	i = last_cap_str++;
				75	if (last_cap_str == MAX_CAP_STR)
				76	last_cap_str = 0;
				77	spin_unlock(&cap_str_lock);
				78
				79	s = cap_str[i];
				80
				81	if (caps & CEPH_CAP_PIN)
				82	*s++ = 'p';
				83
				84	c = (caps >> CEPH_CAP_SAUTH) & 3;
				85	if (c) {
				86	*s++ = 'A';
				87	s = gcap_string(s, c);
				88	}
				89
				90	c = (caps >> CEPH_CAP_SLINK) & 3;
				91	if (c) {
				92	*s++ = 'L';
				93	s = gcap_string(s, c);
				94	}
				95
				96	c = (caps >> CEPH_CAP_SXATTR) & 3;
				97	if (c) {
				98	*s++ = 'X';
				99	s = gcap_string(s, c);
				100	}
				101
				102	c = caps >> CEPH_CAP_SFILE;
				103	if (c) {
				104	*s++ = 'F';
				105	s = gcap_string(s, c);
				106	}
				107
				108	if (s == cap_str[i])
				109	*s++ = '-';
				110	*s = 0;
				111	return cap_str[i];
				112	}
				113
				114	/*
				115	* Cap reservations
				116	*
				117	* Maintain a global pool of preallocated struct ceph_caps, referenced
				118	* by struct ceph_caps_reservations. This ensures that we preallocate
				119	* memory needed to successfully process an MDS response. (If an MDS
				120	* sends us cap information and we fail to process it, we will have
				121	* problems due to the client and MDS being out of sync.)
				122	*
				123	* Reservations are 'owned' by a ceph_cap_reservation context.
				124	*/
				125	static spinlock_t caps_list_lock;
				126	static struct list_head caps_list; /* unused (reserved or unreserved) */
				127	static int caps_total_count; /* total caps allocated */
				128	static int caps_use_count; /* in use */
				129	static int caps_reserve_count; /* unused, reserved */
				130	static int caps_avail_count; /* unused, unreserved */
				131
				132	void __init ceph_caps_init(void)
				133	{
				134	INIT_LIST_HEAD(&caps_list);
				135	spin_lock_init(&caps_list_lock);
				136	}
				137
				138	void ceph_caps_finalize(void)
				139	{
				140	struct ceph_cap *cap;
				141
				142	spin_lock(&caps_list_lock);
				143	while (!list_empty(&caps_list)) {
				144	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
				145	list_del(&cap->caps_item);
				146	kmem_cache_free(ceph_cap_cachep, cap);
				147	}
				148	caps_total_count = 0;
				149	caps_avail_count = 0;
				150	caps_use_count = 0;
				151	caps_reserve_count = 0;
				152	spin_unlock(&caps_list_lock);
				153	}
				154
				155	int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
				156	{
				157	int i;
				158	struct ceph_cap *cap;
				159	int have;
				160	int alloc = 0;
				161	LIST_HEAD(newcaps);
				162	int ret = 0;
				163
				164	dout("reserve caps ctx=%p need=%d\n", ctx, need);
				165
				166	/* first reserve any caps that are already allocated */
				167	spin_lock(&caps_list_lock);
				168	if (caps_avail_count >= need)
				169	have = need;
				170	else
				171	have = caps_avail_count;
				172	caps_avail_count -= have;
				173	caps_reserve_count += have;
				174	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				175	caps_avail_count);
				176	spin_unlock(&caps_list_lock);
				177
				178	for (i = have; i < need; i++) {
				179	cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				180	if (!cap) {
				181	ret = -ENOMEM;
				182	goto out_alloc_count;
				183	}
				184	list_add(&cap->caps_item, &newcaps);
				185	alloc++;
				186	}
				187	BUG_ON(have + alloc != need);
				188
				189	spin_lock(&caps_list_lock);
				190	caps_total_count += alloc;
				191	caps_reserve_count += alloc;
				192	list_splice(&newcaps, &caps_list);
				193
				194	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				195	caps_avail_count);
				196	spin_unlock(&caps_list_lock);
				197
				198	ctx->count = need;
				199	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
				200	ctx, caps_total_count, caps_use_count, caps_reserve_count,
				201	caps_avail_count);
				202	return 0;
				203
				204	out_alloc_count:
				205	/* we didn't manage to reserve as much as we needed */
				206	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
				207	ctx, need, have);
				208	return ret;
				209	}
				210
				211	int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
				212	{
				213	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
				214	if (ctx->count) {
				215	spin_lock(&caps_list_lock);
				216	BUG_ON(caps_reserve_count < ctx->count);
				217	caps_reserve_count -= ctx->count;
				218	caps_avail_count += ctx->count;
				219	ctx->count = 0;
				220	dout("unreserve caps %d = %d used + %d resv + %d avail\n",
				221	caps_total_count, caps_use_count, caps_reserve_count,
				222	caps_avail_count);
				223	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				224	caps_avail_count);
				225	spin_unlock(&caps_list_lock);
				226	}
				227	return 0;
				228	}
				229
				230	static struct ceph_cap get_cap(struct ceph_cap_reservation ctx)
				231	{
				232	struct ceph_cap *cap = NULL;
				233
				234	/* temporary, until we do something about cap import/export */
				235	if (!ctx)
				236	return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
				237
				238	spin_lock(&caps_list_lock);
				239	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
				240	ctx, ctx->count, caps_total_count, caps_use_count,
				241	caps_reserve_count, caps_avail_count);
				242	BUG_ON(!ctx->count);
				243	BUG_ON(ctx->count > caps_reserve_count);
				244	BUG_ON(list_empty(&caps_list));
				245
				246	ctx->count--;
				247	caps_reserve_count--;
				248	caps_use_count++;
				249
				250	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
				251	list_del(&cap->caps_item);
				252
				253	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				254	caps_avail_count);
				255	spin_unlock(&caps_list_lock);
				256	return cap;
				257	}
				258
				259	static void put_cap(struct ceph_cap *cap,
				260	struct ceph_cap_reservation *ctx)
				261	{
				262	spin_lock(&caps_list_lock);
				263	dout("put_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
				264	ctx, ctx ? ctx->count : 0, caps_total_count, caps_use_count,
				265	caps_reserve_count, caps_avail_count);
				266	caps_use_count--;
				267	/*
				268	* Keep some preallocated caps around, at least enough to do a
				269	* readdir (which needs to preallocate lots of them), to avoid
				270	* lots of free/alloc churn.
				271	*/
				272	if (caps_avail_count >= caps_reserve_count +
Sage Weil	6b80518	2009-10-27 11:50:50 -0700	[diff] [blame^]	273	ceph_client(cap->ci->vfs_inode.i_sb)->mount_args->max_readdir) {
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	274	caps_total_count--;
				275	kmem_cache_free(ceph_cap_cachep, cap);
				276	} else {
				277	if (ctx) {
				278	ctx->count++;
				279	caps_reserve_count++;
				280	} else {
				281	caps_avail_count++;
				282	}
				283	list_add(&cap->caps_item, &caps_list);
				284	}
				285
				286	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
				287	caps_avail_count);
				288	spin_unlock(&caps_list_lock);
				289	}
				290
				291	void ceph_reservation_status(struct ceph_client *client,
				292	int total, int avail, int used, int reserved)
				293	{
				294	if (total)
				295	*total = caps_total_count;
				296	if (avail)
				297	*avail = caps_avail_count;
				298	if (used)
				299	*used = caps_use_count;
				300	if (reserved)
				301	*reserved = caps_reserve_count;
				302	}
				303
				304	/*
				305	* Find ceph_cap for given mds, if any.
				306	*
				307	* Called with i_lock held.
				308	*/
				309	static struct ceph_cap __get_cap_for_mds(struct ceph_inode_info ci, int mds)
				310	{
				311	struct ceph_cap *cap;
				312	struct rb_node *n = ci->i_caps.rb_node;
				313
				314	while (n) {
				315	cap = rb_entry(n, struct ceph_cap, ci_node);
				316	if (mds < cap->mds)
				317	n = n->rb_left;
				318	else if (mds > cap->mds)
				319	n = n->rb_right;
				320	else
				321	return cap;
				322	}
				323	return NULL;
				324	}
				325
				326	/*
				327	* Return id of any MDS with a cap, preferably FILE_WR\|WRBUFFER\|EXCL, else
				328	* -1.
				329	*/
				330	static int __ceph_get_cap_mds(struct ceph_inode_info ci, u32 mseq)
				331	{
				332	struct ceph_cap *cap;
				333	int mds = -1;
				334	struct rb_node *p;
				335
				336	/* prefer mds with WR\|WRBUFFER\|EXCL caps */
				337	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				338	cap = rb_entry(p, struct ceph_cap, ci_node);
				339	mds = cap->mds;
				340	if (mseq)
				341	*mseq = cap->mseq;
				342	if (cap->issued & (CEPH_CAP_FILE_WR \|
				343	CEPH_CAP_FILE_BUFFER \|
				344	CEPH_CAP_FILE_EXCL))
				345	break;
				346	}
				347	return mds;
				348	}
				349
				350	int ceph_get_cap_mds(struct inode *inode)
				351	{
				352	int mds;
				353	spin_lock(&inode->i_lock);
				354	mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
				355	spin_unlock(&inode->i_lock);
				356	return mds;
				357	}
				358
				359	/*
				360	* Called under i_lock.
				361	*/
				362	static void __insert_cap_node(struct ceph_inode_info *ci,
				363	struct ceph_cap *new)
				364	{
				365	struct rb_node **p = &ci->i_caps.rb_node;
				366	struct rb_node *parent = NULL;
				367	struct ceph_cap *cap = NULL;
				368
				369	while (*p) {
				370	parent = *p;
				371	cap = rb_entry(parent, struct ceph_cap, ci_node);
				372	if (new->mds < cap->mds)
				373	p = &(*p)->rb_left;
				374	else if (new->mds > cap->mds)
				375	p = &(*p)->rb_right;
				376	else
				377	BUG();
				378	}
				379
				380	rb_link_node(&new->ci_node, parent, p);
				381	rb_insert_color(&new->ci_node, &ci->i_caps);
				382	}
				383
				384	/*
				385	* (re)set cap hold timeouts, which control the delayed release
				386	* of unused caps back to the MDS. Should be called on cap use.
				387	*/
				388	static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
				389	struct ceph_inode_info *ci)
				390	{
Sage Weil	6b80518	2009-10-27 11:50:50 -0700	[diff] [blame^]	391	struct ceph_mount_args *ma = mdsc->client->mount_args;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	392
				393	ci->i_hold_caps_min = round_jiffies(jiffies +
				394	ma->caps_wanted_delay_min * HZ);
				395	ci->i_hold_caps_max = round_jiffies(jiffies +
				396	ma->caps_wanted_delay_max * HZ);
				397	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
				398	ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
				399	}
				400
				401	/*
				402	* (Re)queue cap at the end of the delayed cap release list.
				403	*
				404	* If I_FLUSH is set, leave the inode at the front of the list.
				405	*
				406	* Caller holds i_lock
				407	* -> we take mdsc->cap_delay_lock
				408	*/
				409	static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				410	struct ceph_inode_info *ci)
				411	{
				412	__cap_set_timeouts(mdsc, ci);
				413	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
				414	ci->i_ceph_flags, ci->i_hold_caps_max);
				415	if (!mdsc->stopping) {
				416	spin_lock(&mdsc->cap_delay_lock);
				417	if (!list_empty(&ci->i_cap_delay_list)) {
				418	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				419	goto no_change;
				420	list_del_init(&ci->i_cap_delay_list);
				421	}
				422	list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				423	no_change:
				424	spin_unlock(&mdsc->cap_delay_lock);
				425	}
				426	}
				427
				428	/*
				429	* Queue an inode for immediate writeback. Mark inode with I_FLUSH,
				430	* indicating we should send a cap message to flush dirty metadata
				431	* asap, and move to the front of the delayed cap list.
				432	*/
				433	static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				434	struct ceph_inode_info *ci)
				435	{
				436	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
				437	spin_lock(&mdsc->cap_delay_lock);
				438	ci->i_ceph_flags \|= CEPH_I_FLUSH;
				439	if (!list_empty(&ci->i_cap_delay_list))
				440	list_del_init(&ci->i_cap_delay_list);
				441	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
				442	spin_unlock(&mdsc->cap_delay_lock);
				443	}
				444
				445	/*
				446	* Cancel delayed work on cap.
				447	*
				448	* Caller must hold i_lock.
				449	*/
				450	static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
				451	struct ceph_inode_info *ci)
				452	{
				453	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
				454	if (list_empty(&ci->i_cap_delay_list))
				455	return;
				456	spin_lock(&mdsc->cap_delay_lock);
				457	list_del_init(&ci->i_cap_delay_list);
				458	spin_unlock(&mdsc->cap_delay_lock);
				459	}
				460
				461	/*
				462	* Common issue checks for add_cap, handle_cap_grant.
				463	*/
				464	static void __check_cap_issue(struct ceph_inode_info ci, struct ceph_cap cap,
				465	unsigned issued)
				466	{
				467	unsigned had = __ceph_caps_issued(ci, NULL);
				468
				469	/*
				470	* Each time we receive FILE_CACHE anew, we increment
				471	* i_rdcache_gen.
				472	*/
				473	if ((issued & CEPH_CAP_FILE_CACHE) &&
				474	(had & CEPH_CAP_FILE_CACHE) == 0)
				475	ci->i_rdcache_gen++;
				476
				477	/*
				478	* if we are newly issued FILE_SHARED, clear I_COMPLETE; we
				479	* don't know what happened to this directory while we didn't
				480	* have the cap.
				481	*/
				482	if ((issued & CEPH_CAP_FILE_SHARED) &&
				483	(had & CEPH_CAP_FILE_SHARED) == 0) {
				484	ci->i_shared_gen++;
				485	if (S_ISDIR(ci->vfs_inode.i_mode)) {
				486	dout(" marking %p NOT complete\n", &ci->vfs_inode);
				487	ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
				488	}
				489	}
				490	}
				491
				492	/*
				493	* Add a capability under the given MDS session.
				494	*
				495	* Caller should hold session snap_rwsem (read) and s_mutex.
				496	*
				497	* @fmode is the open file mode, if we are opening a file, otherwise
				498	* it is < 0. (This is so we can atomically add the cap and add an
				499	* open file reference to it.)
				500	*/
				501	int ceph_add_cap(struct inode *inode,
				502	struct ceph_mds_session *session, u64 cap_id,
				503	int fmode, unsigned issued, unsigned wanted,
				504	unsigned seq, unsigned mseq, u64 realmino, int flags,
				505	struct ceph_cap_reservation *caps_reservation)
				506	{
				507	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
				508	struct ceph_inode_info *ci = ceph_inode(inode);
				509	struct ceph_cap *new_cap = NULL;
				510	struct ceph_cap *cap;
				511	int mds = session->s_mds;
				512	int actual_wanted;
				513
				514	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
				515	session->s_mds, cap_id, ceph_cap_string(issued), seq);
				516
				517	/*
				518	* If we are opening the file, include file mode wanted bits
				519	* in wanted.
				520	*/
				521	if (fmode >= 0)
				522	wanted \|= ceph_caps_for_mode(fmode);
				523
				524	retry:
				525	spin_lock(&inode->i_lock);
				526	cap = __get_cap_for_mds(ci, mds);
				527	if (!cap) {
				528	if (new_cap) {
				529	cap = new_cap;
				530	new_cap = NULL;
				531	} else {
				532	spin_unlock(&inode->i_lock);
				533	new_cap = get_cap(caps_reservation);
				534	if (new_cap == NULL)
				535	return -ENOMEM;
				536	goto retry;
				537	}
				538
				539	cap->issued = 0;
				540	cap->implemented = 0;
				541	cap->mds = mds;
				542	cap->mds_wanted = 0;
				543
				544	cap->ci = ci;
				545	__insert_cap_node(ci, cap);
				546
				547	/* clear out old exporting info? (i.e. on cap import) */
				548	if (ci->i_cap_exporting_mds == mds) {
				549	ci->i_cap_exporting_issued = 0;
				550	ci->i_cap_exporting_mseq = 0;
				551	ci->i_cap_exporting_mds = -1;
				552	}
				553
				554	/* add to session cap list */
				555	cap->session = session;
				556	spin_lock(&session->s_cap_lock);
				557	list_add_tail(&cap->session_caps, &session->s_caps);
				558	session->s_nr_caps++;
				559	spin_unlock(&session->s_cap_lock);
				560	}
				561
				562	if (!ci->i_snap_realm) {
				563	/*
				564	* add this inode to the appropriate snap realm
				565	*/
				566	struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
				567	realmino);
				568	if (realm) {
				569	ceph_get_snap_realm(mdsc, realm);
				570	spin_lock(&realm->inodes_with_caps_lock);
				571	ci->i_snap_realm = realm;
				572	list_add(&ci->i_snap_realm_item,
				573	&realm->inodes_with_caps);
				574	spin_unlock(&realm->inodes_with_caps_lock);
				575	} else {
				576	pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
				577	realmino);
				578	}
				579	}
				580
				581	__check_cap_issue(ci, cap, issued);
				582
				583	/*
				584	* If we are issued caps we don't want, or the mds' wanted
				585	* value appears to be off, queue a check so we'll release
				586	* later and/or update the mds wanted value.
				587	*/
				588	actual_wanted = __ceph_caps_wanted(ci);
				589	if ((wanted & ~actual_wanted) \|\|
				590	(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
				591	dout(" issued %s, mds wanted %s, actual %s, queueing\n",
				592	ceph_cap_string(issued), ceph_cap_string(wanted),
				593	ceph_cap_string(actual_wanted));
				594	__cap_delay_requeue(mdsc, ci);
				595	}
				596
				597	if (flags & CEPH_CAP_FLAG_AUTH)
				598	ci->i_auth_cap = cap;
				599	else if (ci->i_auth_cap == cap)
				600	ci->i_auth_cap = NULL;
				601
				602	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
				603	inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
				604	ceph_cap_string(issued\|cap->issued), seq, mds);
				605	cap->cap_id = cap_id;
				606	cap->issued = issued;
				607	cap->implemented \|= issued;
				608	cap->mds_wanted \|= wanted;
				609	cap->seq = seq;
				610	cap->issue_seq = seq;
				611	cap->mseq = mseq;
				612	cap->gen = session->s_cap_gen;
				613
				614	if (fmode >= 0)
				615	__ceph_get_fmode(ci, fmode);
				616	spin_unlock(&inode->i_lock);
				617	wake_up(&ci->i_cap_wq);
				618	return 0;
				619	}
				620
				621	/*
				622	* Return true if cap has not timed out and belongs to the current
				623	* generation of the MDS session (i.e. has not gone 'stale' due to
				624	* us losing touch with the mds).
				625	*/
				626	static int __cap_is_valid(struct ceph_cap *cap)
				627	{
				628	unsigned long ttl;
				629	u32 gen;
				630
				631	spin_lock(&cap->session->s_cap_lock);
				632	gen = cap->session->s_cap_gen;
				633	ttl = cap->session->s_cap_ttl;
				634	spin_unlock(&cap->session->s_cap_lock);
				635
				636	if (cap->gen < gen \|\| time_after_eq(jiffies, ttl)) {
				637	dout("__cap_is_valid %p cap %p issued %s "
				638	"but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
				639	cap, ceph_cap_string(cap->issued), cap->gen, gen);
				640	return 0;
				641	}
				642
				643	return 1;
				644	}
				645
				646	/*
				647	* Return set of valid cap bits issued to us. Note that caps time
				648	* out, and may be invalidated in bulk if the client session times out
				649	* and session->s_cap_gen is bumped.
				650	*/
				651	int __ceph_caps_issued(struct ceph_inode_info ci, int implemented)
				652	{
				653	int have = ci->i_snap_caps;
				654	struct ceph_cap *cap;
				655	struct rb_node *p;
				656
				657	if (implemented)
				658	*implemented = 0;
				659	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				660	cap = rb_entry(p, struct ceph_cap, ci_node);
				661	if (!__cap_is_valid(cap))
				662	continue;
				663	dout("__ceph_caps_issued %p cap %p issued %s\n",
				664	&ci->vfs_inode, cap, ceph_cap_string(cap->issued));
				665	have \|= cap->issued;
				666	if (implemented)
				667	*implemented \|= cap->implemented;
				668	}
				669	return have;
				670	}
				671
				672	/*
				673	* Get cap bits issued by caps other than @ocap
				674	*/
				675	int __ceph_caps_issued_other(struct ceph_inode_info ci, struct ceph_cap ocap)
				676	{
				677	int have = ci->i_snap_caps;
				678	struct ceph_cap *cap;
				679	struct rb_node *p;
				680
				681	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				682	cap = rb_entry(p, struct ceph_cap, ci_node);
				683	if (cap == ocap)
				684	continue;
				685	if (!__cap_is_valid(cap))
				686	continue;
				687	have \|= cap->issued;
				688	}
				689	return have;
				690	}
				691
				692	/*
				693	* Move a cap to the end of the LRU (oldest caps at list head, newest
				694	* at list tail).
				695	*/
				696	static void __touch_cap(struct ceph_cap *cap)
				697	{
				698	struct ceph_mds_session *s = cap->session;
				699
				700	dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
				701	s->s_mds);
				702	spin_lock(&s->s_cap_lock);
				703	list_move_tail(&cap->session_caps, &s->s_caps);
				704	spin_unlock(&s->s_cap_lock);
				705	}
				706
				707	/*
				708	* Check if we hold the given mask. If so, move the cap(s) to the
				709	* front of their respective LRUs. (This is the preferred way for
				710	* callers to check for caps they want.)
				711	*/
				712	int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
				713	{
				714	struct ceph_cap *cap;
				715	struct rb_node *p;
				716	int have = ci->i_snap_caps;
				717
				718	if ((have & mask) == mask) {
				719	dout("__ceph_caps_issued_mask %p snap issued %s"
				720	" (mask %s)\n", &ci->vfs_inode,
				721	ceph_cap_string(have),
				722	ceph_cap_string(mask));
				723	return 1;
				724	}
				725
				726	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				727	cap = rb_entry(p, struct ceph_cap, ci_node);
				728	if (!__cap_is_valid(cap))
				729	continue;
				730	if ((cap->issued & mask) == mask) {
				731	dout("__ceph_caps_issued_mask %p cap %p issued %s"
				732	" (mask %s)\n", &ci->vfs_inode, cap,
				733	ceph_cap_string(cap->issued),
				734	ceph_cap_string(mask));
				735	if (touch)
				736	__touch_cap(cap);
				737	return 1;
				738	}
				739
				740	/* does a combination of caps satisfy mask? */
				741	have \|= cap->issued;
				742	if ((have & mask) == mask) {
				743	dout("__ceph_caps_issued_mask %p combo issued %s"
				744	" (mask %s)\n", &ci->vfs_inode,
				745	ceph_cap_string(cap->issued),
				746	ceph_cap_string(mask));
				747	if (touch) {
				748	struct rb_node *q;
				749
				750	/* touch this + preceeding caps */
				751	__touch_cap(cap);
				752	for (q = rb_first(&ci->i_caps); q != p;
				753	q = rb_next(q)) {
				754	cap = rb_entry(q, struct ceph_cap,
				755	ci_node);
				756	if (!__cap_is_valid(cap))
				757	continue;
				758	__touch_cap(cap);
				759	}
				760	}
				761	return 1;
				762	}
				763	}
				764
				765	return 0;
				766	}
				767
				768	/*
				769	* Return true if mask caps are currently being revoked by an MDS.
				770	*/
				771	int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
				772	{
				773	struct inode *inode = &ci->vfs_inode;
				774	struct ceph_cap *cap;
				775	struct rb_node *p;
				776	int ret = 0;
				777
				778	spin_lock(&inode->i_lock);
				779	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				780	cap = rb_entry(p, struct ceph_cap, ci_node);
				781	if (__cap_is_valid(cap) &&
				782	(cap->implemented & ~cap->issued & mask)) {
				783	ret = 1;
				784	break;
				785	}
				786	}
				787	spin_unlock(&inode->i_lock);
				788	dout("ceph_caps_revoking %p %s = %d\n", inode,
				789	ceph_cap_string(mask), ret);
				790	return ret;
				791	}
				792
				793	int __ceph_caps_used(struct ceph_inode_info *ci)
				794	{
				795	int used = 0;
				796	if (ci->i_pin_ref)
				797	used \|= CEPH_CAP_PIN;
				798	if (ci->i_rd_ref)
				799	used \|= CEPH_CAP_FILE_RD;
				800	if (ci->i_rdcache_ref \|\| ci->i_rdcache_gen)
				801	used \|= CEPH_CAP_FILE_CACHE;
				802	if (ci->i_wr_ref)
				803	used \|= CEPH_CAP_FILE_WR;
				804	if (ci->i_wrbuffer_ref)
				805	used \|= CEPH_CAP_FILE_BUFFER;
				806	return used;
				807	}
				808
				809	/*
				810	* wanted, by virtue of open file modes
				811	*/
				812	int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
				813	{
				814	int want = 0;
				815	int mode;
				816	for (mode = 0; mode < 4; mode++)
				817	if (ci->i_nr_by_mode[mode])
				818	want \|= ceph_caps_for_mode(mode);
				819	return want;
				820	}
				821
				822	/*
				823	* Return caps we have registered with the MDS(s) as 'wanted'.
				824	*/
				825	int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
				826	{
				827	struct ceph_cap *cap;
				828	struct rb_node *p;
				829	int mds_wanted = 0;
				830
				831	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				832	cap = rb_entry(p, struct ceph_cap, ci_node);
				833	if (!__cap_is_valid(cap))
				834	continue;
				835	mds_wanted \|= cap->mds_wanted;
				836	}
				837	return mds_wanted;
				838	}
				839
				840	/*
				841	* called under i_lock
				842	*/
				843	static int __ceph_is_any_caps(struct ceph_inode_info *ci)
				844	{
				845	return !RB_EMPTY_ROOT(&ci->i_caps) \|\| ci->i_cap_exporting_mds >= 0;
				846	}
				847
				848	/*
				849	* caller should hold i_lock, and session s_mutex.
				850	* returns true if this is the last cap. if so, caller should iput.
				851	*/
				852	void __ceph_remove_cap(struct ceph_cap *cap,
				853	struct ceph_cap_reservation *ctx)
				854	{
				855	struct ceph_mds_session *session = cap->session;
				856	struct ceph_inode_info *ci = cap->ci;
				857	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
				858
				859	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
				860
				861	/* remove from session list */
				862	spin_lock(&session->s_cap_lock);
				863	list_del_init(&cap->session_caps);
				864	session->s_nr_caps--;
				865	spin_unlock(&session->s_cap_lock);
				866
				867	/* remove from inode list */
				868	rb_erase(&cap->ci_node, &ci->i_caps);
				869	cap->session = NULL;
				870	if (ci->i_auth_cap == cap)
				871	ci->i_auth_cap = NULL;
				872
				873	put_cap(cap, ctx);
				874
				875	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
				876	struct ceph_snap_realm *realm = ci->i_snap_realm;
				877	spin_lock(&realm->inodes_with_caps_lock);
				878	list_del_init(&ci->i_snap_realm_item);
				879	ci->i_snap_realm_counter++;
				880	ci->i_snap_realm = NULL;
				881	spin_unlock(&realm->inodes_with_caps_lock);
				882	ceph_put_snap_realm(mdsc, realm);
				883	}
				884	if (!__ceph_is_any_real_caps(ci))
				885	__cap_delay_cancel(mdsc, ci);
				886	}
				887
				888	/*
				889	* Build and send a cap message to the given MDS.
				890	*
				891	* Caller should be holding s_mutex.
				892	*/
				893	static int send_cap_msg(struct ceph_mds_session *session,
				894	u64 ino, u64 cid, int op,
				895	int caps, int wanted, int dirty,
				896	u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
				897	u64 size, u64 max_size,
				898	struct timespec mtime, struct timespec atime,
				899	u64 time_warp_seq,
				900	uid_t uid, gid_t gid, mode_t mode,
				901	u64 xattr_version,
				902	struct ceph_buffer *xattrs_buf,
				903	u64 follows)
				904	{
				905	struct ceph_mds_caps *fc;
				906	struct ceph_msg *msg;
				907
				908	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
				909	" seq %u/%u mseq %u follows %lld size %llu/%llu"
				910	" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
				911	cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
				912	ceph_cap_string(dirty),
				913	seq, issue_seq, mseq, follows, size, max_size,
				914	xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
				915
				916	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
				917	if (IS_ERR(msg))
				918	return PTR_ERR(msg);
				919
				920	fc = msg->front.iov_base;
				921
				922	memset(fc, 0, sizeof(*fc));
				923
				924	fc->cap_id = cpu_to_le64(cid);
				925	fc->op = cpu_to_le32(op);
				926	fc->seq = cpu_to_le32(seq);
				927	fc->client_tid = cpu_to_le64(flush_tid);
				928	fc->issue_seq = cpu_to_le32(issue_seq);
				929	fc->migrate_seq = cpu_to_le32(mseq);
				930	fc->caps = cpu_to_le32(caps);
				931	fc->wanted = cpu_to_le32(wanted);
				932	fc->dirty = cpu_to_le32(dirty);
				933	fc->ino = cpu_to_le64(ino);
				934	fc->snap_follows = cpu_to_le64(follows);
				935
				936	fc->size = cpu_to_le64(size);
				937	fc->max_size = cpu_to_le64(max_size);
				938	if (mtime)
				939	ceph_encode_timespec(&fc->mtime, mtime);
				940	if (atime)
				941	ceph_encode_timespec(&fc->atime, atime);
				942	fc->time_warp_seq = cpu_to_le32(time_warp_seq);
				943
				944	fc->uid = cpu_to_le32(uid);
				945	fc->gid = cpu_to_le32(gid);
				946	fc->mode = cpu_to_le32(mode);
				947
				948	fc->xattr_version = cpu_to_le64(xattr_version);
				949	if (xattrs_buf) {
				950	msg->middle = ceph_buffer_get(xattrs_buf);
				951	fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
				952	msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
				953	}
				954
				955	ceph_con_send(&session->s_con, msg);
				956	return 0;
				957	}
				958
				959	/*
				960	* Queue cap releases when an inode is dropped from our
				961	* cache.
				962	*/
				963	void ceph_queue_caps_release(struct inode *inode)
				964	{
				965	struct ceph_inode_info *ci = ceph_inode(inode);
				966	struct rb_node *p;
				967
				968	spin_lock(&inode->i_lock);
				969	p = rb_first(&ci->i_caps);
				970	while (p) {
				971	struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
				972	struct ceph_mds_session *session = cap->session;
				973	struct ceph_msg *msg;
				974	struct ceph_mds_cap_release *head;
				975	struct ceph_mds_cap_item *item;
				976
				977	spin_lock(&session->s_cap_lock);
				978	BUG_ON(!session->s_num_cap_releases);
				979	msg = list_first_entry(&session->s_cap_releases,
				980	struct ceph_msg, list_head);
				981
				982	dout(" adding %p release to mds%d msg %p (%d left)\n",
				983	inode, session->s_mds, msg, session->s_num_cap_releases);
				984
				985	BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
				986	head = msg->front.iov_base;
				987	head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
				988	item = msg->front.iov_base + msg->front.iov_len;
				989	item->ino = cpu_to_le64(ceph_ino(inode));
				990	item->cap_id = cpu_to_le64(cap->cap_id);
				991	item->migrate_seq = cpu_to_le32(cap->mseq);
				992	item->seq = cpu_to_le32(cap->issue_seq);
				993
				994	session->s_num_cap_releases--;
				995
				996	msg->front.iov_len += sizeof(*item);
				997	if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
				998	dout(" release msg %p full\n", msg);
				999	list_move_tail(&msg->list_head,
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	1000	&session->s_cap_releases_done);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1001	} else {
				1002	dout(" release msg %p at %d/%d (%d)\n", msg,
				1003	(int)le32_to_cpu(head->num),
				1004	(int)CEPH_CAPS_PER_RELEASE,
				1005	(int)msg->front.iov_len);
				1006	}
				1007	spin_unlock(&session->s_cap_lock);
				1008	p = rb_next(p);
				1009	__ceph_remove_cap(cap, NULL);
				1010
				1011	}
				1012	spin_unlock(&inode->i_lock);
				1013	}
				1014
				1015	/*
				1016	* Send a cap msg on the given inode. Update our caps state, then
				1017	* drop i_lock and send the message.
				1018	*
				1019	* Make note of max_size reported/requested from mds, revoked caps
				1020	* that have now been implemented.
				1021	*
				1022	* Make half-hearted attempt ot to invalidate page cache if we are
				1023	* dropping RDCACHE. Note that this will leave behind locked pages
				1024	* that we'll then need to deal with elsewhere.
				1025	*
				1026	* Return non-zero if delayed release, or we experienced an error
				1027	* such that the caller should requeue + retry later.
				1028	*
				1029	* called with i_lock, then drops it.
				1030	* caller should hold snap_rwsem (read), s_mutex.
				1031	*/
				1032	static int __send_cap(struct ceph_mds_client mdsc, struct ceph_cap cap,
				1033	int op, int used, int want, int retain, int flushing,
				1034	unsigned *pflush_tid)
				1035	__releases(cap->ci->vfs_inode->i_lock)
				1036	{
				1037	struct ceph_inode_info *ci = cap->ci;
				1038	struct inode *inode = &ci->vfs_inode;
				1039	u64 cap_id = cap->cap_id;
				1040	int held = cap->issued \| cap->implemented;
				1041	int revoking = cap->implemented & ~cap->issued;
				1042	int dropping = cap->issued & ~retain;
				1043	int keep;
				1044	u64 seq, issue_seq, mseq, time_warp_seq, follows;
				1045	u64 size, max_size;
				1046	struct timespec mtime, atime;
				1047	int wake = 0;
				1048	mode_t mode;
				1049	uid_t uid;
				1050	gid_t gid;
				1051	struct ceph_mds_session *session;
				1052	u64 xattr_version = 0;
				1053	int delayed = 0;
				1054	u64 flush_tid = 0;
				1055	int i;
				1056	int ret;
				1057
				1058	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
				1059	inode, cap, cap->session,
				1060	ceph_cap_string(held), ceph_cap_string(held & retain),
				1061	ceph_cap_string(revoking));
				1062	BUG_ON((retain & CEPH_CAP_PIN) == 0);
				1063
				1064	session = cap->session;
				1065
				1066	/* don't release wanted unless we've waited a bit. */
				1067	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1068	time_before(jiffies, ci->i_hold_caps_min)) {
				1069	dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
				1070	ceph_cap_string(cap->issued),
				1071	ceph_cap_string(cap->issued & retain),
				1072	ceph_cap_string(cap->mds_wanted),
				1073	ceph_cap_string(want));
				1074	want \|= cap->mds_wanted;
				1075	retain \|= cap->issued;
				1076	delayed = 1;
				1077	}
				1078	ci->i_ceph_flags &= ~(CEPH_I_NODELAY \| CEPH_I_FLUSH);
				1079
				1080	cap->issued &= retain; /* drop bits we don't want */
				1081	if (cap->implemented & ~cap->issued) {
				1082	/*
				1083	* Wake up any waiters on wanted -> needed transition.
				1084	* This is due to the weird transition from buffered
				1085	* to sync IO... we need to flush dirty pages _before_
				1086	* allowing sync writes to avoid reordering.
				1087	*/
				1088	wake = 1;
				1089	}
				1090	cap->implemented &= cap->issued \| used;
				1091	cap->mds_wanted = want;
				1092
				1093	if (flushing) {
				1094	/*
				1095	* assign a tid for flush operations so we can avoid
				1096	* flush1 -> dirty1 -> flush2 -> flushack1 -> mark
				1097	* clean type races. track latest tid for every bit
				1098	* so we can handle flush AxFw, flush Fw, and have the
				1099	* first ack clean Ax.
				1100	*/
				1101	flush_tid = ++ci->i_cap_flush_last_tid;
				1102	if (pflush_tid)
				1103	*pflush_tid = flush_tid;
				1104	dout(" cap_flush_tid %d\n", (int)flush_tid);
				1105	for (i = 0; i < CEPH_CAP_BITS; i++)
				1106	if (flushing & (1 << i))
				1107	ci->i_cap_flush_tid[i] = flush_tid;
				1108	}
				1109
				1110	keep = cap->implemented;
				1111	seq = cap->seq;
				1112	issue_seq = cap->issue_seq;
				1113	mseq = cap->mseq;
				1114	size = inode->i_size;
				1115	ci->i_reported_size = size;
				1116	max_size = ci->i_wanted_max_size;
				1117	ci->i_requested_max_size = max_size;
				1118	mtime = inode->i_mtime;
				1119	atime = inode->i_atime;
				1120	time_warp_seq = ci->i_time_warp_seq;
				1121	follows = ci->i_snap_realm->cached_context->seq;
				1122	uid = inode->i_uid;
				1123	gid = inode->i_gid;
				1124	mode = inode->i_mode;
				1125
				1126	if (dropping & CEPH_CAP_XATTR_EXCL) {
				1127	__ceph_build_xattrs_blob(ci);
				1128	xattr_version = ci->i_xattrs.version + 1;
				1129	}
				1130
				1131	spin_unlock(&inode->i_lock);
				1132
				1133	if (dropping & CEPH_CAP_FILE_CACHE) {
				1134	/* invalidate what we can */
				1135	dout("invalidating pages on %p\n", inode);
				1136	invalidate_mapping_pages(&inode->i_data, 0, -1);
				1137	}
				1138
				1139	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
				1140	op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
				1141	size, max_size, &mtime, &atime, time_warp_seq,
				1142	uid, gid, mode,
				1143	xattr_version,
				1144	(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
				1145	follows);
				1146	if (ret < 0) {
				1147	dout("error sending cap msg, must requeue %p\n", inode);
				1148	delayed = 1;
				1149	}
				1150
				1151	if (wake)
				1152	wake_up(&ci->i_cap_wq);
				1153
				1154	return delayed;
				1155	}
				1156
				1157	/*
				1158	* When a snapshot is taken, clients accumulate dirty metadata on
				1159	* inodes with capabilities in ceph_cap_snaps to describe the file
				1160	* state at the time the snapshot was taken. This must be flushed
				1161	* asynchronously back to the MDS once sync writes complete and dirty
				1162	* data is written out.
				1163	*
				1164	* Called under i_lock. Takes s_mutex as needed.
				1165	*/
				1166	void __ceph_flush_snaps(struct ceph_inode_info *ci,
				1167	struct ceph_mds_session **psession)
				1168	{
				1169	struct inode *inode = &ci->vfs_inode;
				1170	int mds;
				1171	struct ceph_cap_snap *capsnap;
				1172	u32 mseq;
				1173	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
				1174	struct ceph_mds_session session = NULL; / if session != NULL, we hold
				1175	session->s_mutex */
				1176	u64 next_follows = 0; /* keep track of how far we've gotten through the
				1177	i_cap_snaps list, and skip these entries next time
				1178	around to avoid an infinite loop */
				1179
				1180	if (psession)
				1181	session = *psession;
				1182
				1183	dout("__flush_snaps %p\n", inode);
				1184	retry:
				1185	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				1186	/* avoid an infiniute loop after retry */
				1187	if (capsnap->follows < next_follows)
				1188	continue;
				1189	/*
				1190	* we need to wait for sync writes to complete and for dirty
				1191	* pages to be written out.
				1192	*/
				1193	if (capsnap->dirty_pages \|\| capsnap->writing)
				1194	continue;
				1195
				1196	/* pick mds, take s_mutex */
				1197	mds = __ceph_get_cap_mds(ci, &mseq);
				1198	if (session && session->s_mds != mds) {
				1199	dout("oops, wrong session %p mutex\n", session);
				1200	mutex_unlock(&session->s_mutex);
				1201	ceph_put_mds_session(session);
				1202	session = NULL;
				1203	}
				1204	if (!session) {
				1205	spin_unlock(&inode->i_lock);
				1206	mutex_lock(&mdsc->mutex);
				1207	session = __ceph_lookup_mds_session(mdsc, mds);
				1208	mutex_unlock(&mdsc->mutex);
				1209	if (session) {
				1210	dout("inverting session/ino locks on %p\n",
				1211	session);
				1212	mutex_lock(&session->s_mutex);
				1213	}
				1214	/*
				1215	* if session == NULL, we raced against a cap
				1216	* deletion. retry, and we'll get a better
				1217	* @mds value next time.
				1218	*/
				1219	spin_lock(&inode->i_lock);
				1220	goto retry;
				1221	}
				1222
				1223	capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
				1224	atomic_inc(&capsnap->nref);
				1225	if (!list_empty(&capsnap->flushing_item))
				1226	list_del_init(&capsnap->flushing_item);
				1227	list_add_tail(&capsnap->flushing_item,
				1228	&session->s_cap_snaps_flushing);
				1229	spin_unlock(&inode->i_lock);
				1230
				1231	dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
				1232	inode, capsnap, next_follows, capsnap->size);
				1233	send_cap_msg(session, ceph_vino(inode).ino, 0,
				1234	CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
				1235	capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
				1236	capsnap->size, 0,
				1237	&capsnap->mtime, &capsnap->atime,
				1238	capsnap->time_warp_seq,
				1239	capsnap->uid, capsnap->gid, capsnap->mode,
				1240	0, NULL,
				1241	capsnap->follows);
				1242
				1243	next_follows = capsnap->follows + 1;
				1244	ceph_put_cap_snap(capsnap);
				1245
				1246	spin_lock(&inode->i_lock);
				1247	goto retry;
				1248	}
				1249
				1250	/* we flushed them all; remove this inode from the queue */
				1251	spin_lock(&mdsc->snap_flush_lock);
				1252	list_del_init(&ci->i_snap_flush_item);
				1253	spin_unlock(&mdsc->snap_flush_lock);
				1254
				1255	if (psession)
				1256	*psession = session;
				1257	else if (session) {
				1258	mutex_unlock(&session->s_mutex);
				1259	ceph_put_mds_session(session);
				1260	}
				1261	}
				1262
				1263	static void ceph_flush_snaps(struct ceph_inode_info *ci)
				1264	{
				1265	struct inode *inode = &ci->vfs_inode;
				1266
				1267	spin_lock(&inode->i_lock);
				1268	__ceph_flush_snaps(ci, NULL);
				1269	spin_unlock(&inode->i_lock);
				1270	}
				1271
				1272	/*
Sage Weil	76e3b39	2009-10-15 18:13:53 -0700	[diff] [blame]	1273	* Mark caps dirty. If inode is newly dirty, add to the global dirty
				1274	* list.
				1275	*/
				1276	void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
				1277	{
				1278	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
				1279	struct inode *inode = &ci->vfs_inode;
				1280	int was = ci->i_dirty_caps;
				1281	int dirty = 0;
				1282
				1283	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
				1284	ceph_cap_string(mask), ceph_cap_string(was),
				1285	ceph_cap_string(was \| mask));
				1286	ci->i_dirty_caps \|= mask;
				1287	if (was == 0) {
				1288	dout(" inode %p now dirty\n", &ci->vfs_inode);
				1289	BUG_ON(!list_empty(&ci->i_dirty_item));
				1290	spin_lock(&mdsc->cap_dirty_lock);
				1291	list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
				1292	spin_unlock(&mdsc->cap_dirty_lock);
				1293	if (ci->i_flushing_caps == 0) {
				1294	igrab(inode);
				1295	dirty \|= I_DIRTY_SYNC;
				1296	}
				1297	}
				1298	BUG_ON(list_empty(&ci->i_dirty_item));
				1299	if (((was \| ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
				1300	(mask & CEPH_CAP_FILE_BUFFER))
				1301	dirty \|= I_DIRTY_DATASYNC;
				1302	if (dirty)
				1303	__mark_inode_dirty(inode, dirty);
				1304	__cap_delay_requeue(mdsc, ci);
				1305	}
				1306
				1307	/*
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1308	* Add dirty inode to the flushing list. Assigned a seq number so we
				1309	* can wait for caps to flush without starving.
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1310	*
				1311	* Called under i_lock.
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1312	*/
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1313	static int __mark_caps_flushing(struct inode *inode,
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1314	struct ceph_mds_session *session)
				1315	{
				1316	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1317	struct ceph_inode_info *ci = ceph_inode(inode);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1318	int flushing;
				1319
				1320	BUG_ON(ci->i_dirty_caps == 0);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1321	BUG_ON(list_empty(&ci->i_dirty_item));
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1322
				1323	flushing = ci->i_dirty_caps;
				1324	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
				1325	ceph_cap_string(flushing),
				1326	ceph_cap_string(ci->i_flushing_caps),
				1327	ceph_cap_string(ci->i_flushing_caps \| flushing));
				1328	ci->i_flushing_caps \|= flushing;
				1329	ci->i_dirty_caps = 0;
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	1330	dout(" inode %p now !dirty\n", inode);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1331
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1332	spin_lock(&mdsc->cap_dirty_lock);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	1333	list_del_init(&ci->i_dirty_item);
				1334
				1335	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1336	if (list_empty(&ci->i_flushing_item)) {
				1337	list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
				1338	mdsc->num_cap_flushing++;
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	1339	dout(" inode %p now flushing seq %lld\n", inode,
				1340	ci->i_cap_flush_seq);
				1341	} else {
				1342	list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
				1343	dout(" inode %p now flushing (more) seq %lld\n", inode,
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1344	ci->i_cap_flush_seq);
				1345	}
				1346	spin_unlock(&mdsc->cap_dirty_lock);
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1347
				1348	return flushing;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1349	}
				1350
				1351	/*
				1352	* Swiss army knife function to examine currently used and wanted
				1353	* versus held caps. Release, flush, ack revoked caps to mds as
				1354	* appropriate.
				1355	*
				1356	* CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
				1357	* cap release further.
				1358	* CHECK_CAPS_AUTHONLY - we should only check the auth cap
				1359	* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
				1360	* further delay.
					*
					* @session: if non-NULL, no session s_mutex is unlocked on the way
					* out (drop_session_lock is 0). Presumably the caller already holds
					* that session's s_mutex -- confirm against callers.
					*
					* Takes and releases i_lock itself. The cap scan is restarted from
					* the top every time i_lock has been dropped; "mds" records the
					* last cap handled so a restart skips caps already processed.
				1361	*/
				1362	void ceph_check_caps(struct ceph_inode_info *ci, int flags,
				1363	struct ceph_mds_session *session)
				1364	{
				1365	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
				1366	struct ceph_mds_client *mdsc = &client->mdsc;
				1367	struct inode *inode = &ci->vfs_inode;
				1368	struct ceph_cap *cap;
				1369	int file_wanted, used;
				1370	int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
				1371	int drop_session_lock = session ? 0 : 1;
				1372	int want, retain, revoking, flushing = 0;
				1373	int mds = -1; /* keep track of how far we've gone through i_caps list
				1374	to avoid an infinite loop on retry */
				1375	struct rb_node *p;
				1376	int tried_invalidate = 0;
				1377	int delayed = 0, sent = 0, force_requeue = 0, num;
				1378	int is_delayed = flags & CHECK_CAPS_NODELAY;
				1379
				1380	/* if we are unmounting, flush any unused caps immediately. */
				1381	if (mdsc->stopping)
				1382	is_delayed = 1;
				1383
				1384	spin_lock(&inode->i_lock);
				1385
					/* a flush requested via the inode flag counts as CHECK_CAPS_FLUSH */
				1386	if (ci->i_ceph_flags & CEPH_I_FLUSH)
				1387	flags \|= CHECK_CAPS_FLUSH;
				1388
				1389	/* flush snaps first time around only */
				1390	if (!list_empty(&ci->i_cap_snaps))
				1391	__ceph_flush_snaps(ci, &session);
					/* i_lock is already held here, so skip the relock at retry: */
				1392	goto retry_locked;
					/* retry: entered after i_lock was dropped; retake it and rescan */
				1393	retry:
				1394	spin_lock(&inode->i_lock);
				1395	retry_locked:
				1396	file_wanted = __ceph_caps_file_wanted(ci);
				1397	used = __ceph_caps_used(ci);
				1398	want = file_wanted \| used;
				1399
					/* work out which caps we would like to keep issued */
				1400	retain = want \| CEPH_CAP_PIN;
				1401	if (!mdsc->stopping && inode->i_nlink > 0) {
				1402	if (want) {
				1403	retain \|= CEPH_CAP_ANY; /* be greedy */
				1404	} else {
				1405	retain \|= CEPH_CAP_ANY_SHARED;
				1406	/*
				1407	* keep RD only if we didn't have the file open RW,
				1408	* because then the mds would revoke it anyway to
				1409	* journal max_size=0.
				1410	*/
				1411	if (ci->i_max_size == 0)
				1412	retain \|= CEPH_CAP_ANY_RD;
				1413	}
				1414	}
				1415
				1416	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
				1417	" issued %s retain %s %s%s%s\n", inode,
				1418	ceph_cap_string(file_wanted),
				1419	ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
				1420	ceph_cap_string(ci->i_flushing_caps),
				1421	ceph_cap_string(__ceph_caps_issued(ci, NULL)),
				1422	ceph_cap_string(retain),
				1423	(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
				1424	(flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
				1425	(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
				1426
				1427	/*
				1428	* If we no longer need to hold onto our old caps, and we may
				1429	* have cached pages, but don't want them, then try to invalidate.
				1430	* If we fail, it's because pages are locked.... try again later.
					*
					* tried_invalidate ensures this is attempted at most once per call.
				1431	*/
				1432	if ((!is_delayed \|\| mdsc->stopping) &&
				1433	ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
				1434	ci->i_rdcache_gen && /* may have cached pages */
				1435	file_wanted == 0 && /* no open files */
				1436	!ci->i_truncate_pending &&
				1437	!tried_invalidate) {
				1438	u32 invalidating_gen = ci->i_rdcache_gen;
				1439	int ret;
				1440
				1441	dout("check_caps trying to invalidate on %p\n", inode);
					/* i_lock is dropped around the page invalidation call */
				1442	spin_unlock(&inode->i_lock);
				1443	ret = invalidate_inode_pages2(&inode->i_data);
				1444	spin_lock(&inode->i_lock);
					/* only count it as success if the gen did not change meanwhile */
				1445	if (ret == 0 && invalidating_gen == ci->i_rdcache_gen) {
				1446	/* success. */
				1447	ci->i_rdcache_gen = 0;
				1448	ci->i_rdcache_revoking = 0;
				1449	} else {
				1450	dout("check_caps failed to invalidate pages\n");
				1451	/* we failed to invalidate pages. check these
				1452	caps again later. */
				1453	force_requeue = 1;
				1454	__cap_set_timeouts(mdsc, ci);
				1455	}
				1456	tried_invalidate = 1;
					/* i_lock was dropped above, so recompute used/want/retain */
				1457	goto retry_locked;
				1458	}
				1459
				1460	num = 0;
				1461	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				1462	cap = rb_entry(p, struct ceph_cap, ci_node);
				1463	num++;
				1464
				1465	/* avoid looping forever */
				1466	if (mds >= cap->mds \|\|
				1467	((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
				1468	continue;
				1469
				1470	/* NOTE: no side-effects allowed, until we take s_mutex */
				1471
				1472	revoking = cap->implemented & ~cap->issued;
				1473	if (revoking)
				1474	dout("mds%d revoking %s\n", cap->mds,
				1475	ceph_cap_string(revoking));
				1476
				1477	if (cap == ci->i_auth_cap &&
				1478	(cap->issued & CEPH_CAP_FILE_WR)) {
				1479	/* request larger max_size from MDS? */
				1480	if (ci->i_wanted_max_size > ci->i_max_size &&
				1481	ci->i_wanted_max_size > ci->i_requested_max_size) {
				1482	dout("requesting new max_size\n");
				1483	goto ack;
				1484	}
				1485
				1486	/* approaching file_max? */
				1487	if ((inode->i_size << 1) >= ci->i_max_size &&
				1488	(ci->i_reported_size << 1) < ci->i_max_size) {
				1489	dout("i_size approaching max_size\n");
				1490	goto ack;
				1491	}
				1492	}
				1493	/* flush anything dirty? */
				1494	if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
				1495	ci->i_dirty_caps) {
				1496	dout("flushing dirty caps\n");
				1497	goto ack;
				1498	}
				1499
				1500	/* completed revocation? going down and there are no caps? */
				1501	if (revoking && (revoking & used) == 0) {
				1502	dout("completed revocation of %s\n",
				1503	ceph_cap_string(cap->implemented & ~cap->issued));
				1504	goto ack;
				1505	}
				1506
				1507	/* want more caps from mds? */
				1508	if (want & ~(cap->mds_wanted \| cap->issued))
				1509	goto ack;
				1510
				1511	/* things we might delay */
				1512	if ((cap->issued & ~retain) == 0 &&
				1513	cap->mds_wanted == want)
				1514	continue; /* nope, all good */
				1515
				1516	if (is_delayed)
				1517	goto ack;
				1518
				1519	/* delay? */
				1520	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
				1521	time_before(jiffies, ci->i_hold_caps_max)) {
				1522	dout(" delaying issued %s -> %s, wanted %s -> %s\n",
				1523	ceph_cap_string(cap->issued),
				1524	ceph_cap_string(cap->issued & retain),
				1525	ceph_cap_string(cap->mds_wanted),
				1526	ceph_cap_string(want));
				1527	delayed++;
				1528	continue;
				1529	}
				1530
					/*
					* ack: we will send a cap message for this cap. First make sure
					* we hold the s_mutex of this cap's session, then snap_rwsem.
					*/
				1531	ack:
				1532	if (session && session != cap->session) {
				1533	dout("oops, wrong session %p mutex\n", session);
				1534	mutex_unlock(&session->s_mutex);
				1535	session = NULL;
				1536	}
				1537	if (!session) {
				1538	session = cap->session;
					/*
					* Only a trylock is attempted while the i_lock spinlock is
					* held. If it fails, drop i_lock (and snap_rwsem), block
					* on the mutex, then restart the scan.
					*/
				1539	if (mutex_trylock(&session->s_mutex) == 0) {
				1540	dout("inverting session/ino locks on %p\n",
				1541	session);
				1542	spin_unlock(&inode->i_lock);
				1543	if (took_snap_rwsem) {
				1544	up_read(&mdsc->snap_rwsem);
				1545	took_snap_rwsem = 0;
				1546	}
				1547	mutex_lock(&session->s_mutex);
				1548	goto retry;
				1549	}
				1550	}
				1551	/* take snap_rwsem after session mutex */
				1552	if (!took_snap_rwsem) {
					/* same pattern: trylock under i_lock, else drop it and block */
				1553	if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
				1554	dout("inverting snap/in locks on %p\n",
				1555	inode);
				1556	spin_unlock(&inode->i_lock);
				1557	down_read(&mdsc->snap_rwsem);
				1558	took_snap_rwsem = 1;
				1559	goto retry;
				1560	}
				1561	took_snap_rwsem = 1;
				1562	}
				1563
					/* dirty caps are flushed along with the message to the auth cap */
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1564	if (cap == ci->i_auth_cap && ci->i_dirty_caps)
				1565	flushing = __mark_caps_flushing(inode, session);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1566
				1567	mds = cap->mds; /* remember mds, so we don't repeat */
				1568	sent++;
				1569
				1570	/* __send_cap drops i_lock */
				1571	delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
				1572	retain, flushing, NULL);
				1573	goto retry; /* retake i_lock and restart our cap scan. */
				1574	}
				1575
				1576	/*
				1577	* Reschedule delayed caps release if we delayed anything,
				1578	* otherwise cancel.
				1579	*/
				1580	if (delayed && is_delayed)
				1581	force_requeue = 1; /* __send_cap delayed release; requeue */
				1582	if (!delayed && !is_delayed)
				1583	__cap_delay_cancel(mdsc, ci);
				1584	else if (!is_delayed \|\| force_requeue)
				1585	__cap_delay_requeue(mdsc, ci);
				1586
				1587	spin_unlock(&inode->i_lock);
				1588
					/* release whatever we still hold, in the order written below */
				1589	if (session && drop_session_lock)
				1590	mutex_unlock(&session->s_mutex);
				1591	if (took_snap_rwsem)
				1592	up_read(&mdsc->snap_rwsem);
				1593	}
				1594
				1595	/*
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1596	* Try to flush dirty caps back to the auth mds.
				1597	*/
				1598	static int try_flush_caps(struct inode inode, struct ceph_mds_session session,
				1599	unsigned *flush_tid)
				1600	{
				1601	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1602	struct ceph_inode_info *ci = ceph_inode(inode);
				1603	int unlock_session = session ? 0 : 1;
				1604	int flushing = 0;
				1605
				1606	retry:
				1607	spin_lock(&inode->i_lock);
				1608	if (ci->i_dirty_caps && ci->i_auth_cap) {
				1609	struct ceph_cap *cap = ci->i_auth_cap;
				1610	int used = __ceph_caps_used(ci);
				1611	int want = __ceph_caps_wanted(ci);
				1612	int delayed;
				1613
				1614	if (!session) {
				1615	spin_unlock(&inode->i_lock);
				1616	session = cap->session;
				1617	mutex_lock(&session->s_mutex);
				1618	goto retry;
				1619	}
				1620	BUG_ON(session != cap->session);
				1621	if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
				1622	goto out;
				1623
Sage Weil	cdc35f9	2009-10-14 14:24:19 -0700	[diff] [blame]	1624	flushing = __mark_caps_flushing(inode, session);
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	1625
				1626	/* __send_cap drops i_lock */
				1627	delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
				1628	cap->issued \| cap->implemented, flushing,
				1629	flush_tid);
				1630	if (!delayed)
				1631	goto out_unlocked;
				1632
				1633	spin_lock(&inode->i_lock);
				1634	__cap_delay_requeue(mdsc, ci);
				1635	}
				1636	out:
				1637	spin_unlock(&inode->i_lock);
				1638	out_unlocked:
				1639	if (session && unlock_session)
				1640	mutex_unlock(&session->s_mutex);
				1641	return flushing;
				1642	}
				1643
				1644	/*
				1645	* Return true if we've flushed caps through the given flush_tid.
				1646	*/
				1647	static int caps_are_flushed(struct inode *inode, unsigned tid)
				1648	{
				1649	struct ceph_inode_info *ci = ceph_inode(inode);
				1650	int dirty, i, ret = 1;
				1651
				1652	spin_lock(&inode->i_lock);
				1653	dirty = __ceph_caps_dirty(ci);
				1654	for (i = 0; i < CEPH_CAP_BITS; i++)
				1655	if ((ci->i_flushing_caps & (1 << i)) &&
				1656	ci->i_cap_flush_tid[i] <= tid) {
				1657	/* still flushing this bit */
				1658	ret = 0;
				1659	break;
				1660	}
				1661	spin_unlock(&inode->i_lock);
				1662	return ret;
				1663	}
				1664
				1665	/*
				1666	* Wait on any unsafe replies for the given inode. First wait on the
				1667	* newest request, and make that the upper bound. Then, if there are
				1668	* more requests, keep waiting on the oldest as long as it is still older
				1669	* than the original request.
				1670	*/
				1671	static void sync_write_wait(struct inode *inode)
				1672	{
				1673	struct ceph_inode_info *ci = ceph_inode(inode);
				1674	struct list_head *head = &ci->i_unsafe_writes;
				1675	struct ceph_osd_request *req;
				1676	u64 last_tid;
				1677
				1678	spin_lock(&ci->i_unsafe_lock);
				1679	if (list_empty(head))
				1680	goto out;
				1681
				1682	/* set upper bound as _last_ entry in chain */
				1683	req = list_entry(head->prev, struct ceph_osd_request,
				1684	r_unsafe_item);
				1685	last_tid = req->r_tid;
				1686
				1687	do {
				1688	ceph_osdc_get_request(req);
				1689	spin_unlock(&ci->i_unsafe_lock);
				1690	dout("sync_write_wait on tid %llu (until %llu)\n",
				1691	req->r_tid, last_tid);
				1692	wait_for_completion(&req->r_safe_completion);
				1693	spin_lock(&ci->i_unsafe_lock);
				1694	ceph_osdc_put_request(req);
				1695
				1696	/*
				1697	* from here on look at first entry in chain, since we
				1698	* only want to wait for anything older than last_tid
				1699	*/
				1700	if (list_empty(head))
				1701	break;
				1702	req = list_entry(head->next, struct ceph_osd_request,
				1703	r_unsafe_item);
				1704	} while (req->r_tid < last_tid);
				1705	out:
				1706	spin_unlock(&ci->i_unsafe_lock);
				1707	}
				1708
				1709	int ceph_fsync(struct file file, struct dentry dentry, int datasync)
				1710	{
				1711	struct inode *inode = dentry->d_inode;
				1712	struct ceph_inode_info *ci = ceph_inode(inode);
				1713	unsigned flush_tid;
				1714	int ret;
				1715	int dirty;
				1716
				1717	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
				1718	sync_write_wait(inode);
				1719
				1720	ret = filemap_write_and_wait(inode->i_mapping);
				1721	if (ret < 0)
				1722	return ret;
				1723
				1724	dirty = try_flush_caps(inode, NULL, &flush_tid);
				1725	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
				1726
				1727	/*
				1728	* only wait on non-file metadata writeback (the mds
				1729	* can recover size and mtime, so we don't need to
				1730	* wait for that)
				1731	*/
				1732	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
				1733	dout("fsync waiting for flush_tid %u\n", flush_tid);
				1734	ret = wait_event_interruptible(ci->i_cap_wq,
				1735	caps_are_flushed(inode, flush_tid));
				1736	}
				1737
				1738	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
				1739	return ret;
				1740	}
				1741
				1742	/*
				1743	* Flush any dirty caps back to the mds. If we aren't asked to wait,
				1744	* queue inode for flush but don't do so immediately, because we can
				1745	* get by with fewer MDS messages if we wait for data writeback to
				1746	* complete first.
				1747	*/
				1748	int ceph_write_inode(struct inode *inode, int wait)
				1749	{
				1750	struct ceph_inode_info *ci = ceph_inode(inode);
				1751	unsigned flush_tid;
				1752	int err = 0;
				1753	int dirty;
				1754
				1755	dout("write_inode %p wait=%d\n", inode, wait);
				1756	if (wait) {
				1757	dirty = try_flush_caps(inode, NULL, &flush_tid);
				1758	if (dirty)
				1759	err = wait_event_interruptible(ci->i_cap_wq,
				1760	caps_are_flushed(inode, flush_tid));
				1761	} else {
				1762	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				1763
				1764	spin_lock(&inode->i_lock);
				1765	if (__ceph_caps_dirty(ci))
				1766	__cap_delay_requeue_front(mdsc, ci);
				1767	spin_unlock(&inode->i_lock);
				1768	}
				1769	return err;
				1770	}
				1771
				1772	/*
				1773	* After a recovering MDS goes active, we need to resend any caps
				1774	* we were flushing.
				1775	*
				1776	* Caller holds session->s_mutex.
				1777	*/
				1778	static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
				1779	struct ceph_mds_session *session)
				1780	{
				1781	struct ceph_cap_snap *capsnap;
				1782
				1783	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
				1784	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
				1785	flushing_item) {
				1786	struct ceph_inode_info *ci = capsnap->ci;
				1787	struct inode *inode = &ci->vfs_inode;
				1788	struct ceph_cap *cap;
				1789
				1790	spin_lock(&inode->i_lock);
				1791	cap = ci->i_auth_cap;
				1792	if (cap && cap->session == session) {
				1793	dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
				1794	cap, capsnap);
				1795	__ceph_flush_snaps(ci, &session);
				1796	} else {
				1797	pr_err("%p auth cap %p not mds%d ???\n", inode,
				1798	cap, session->s_mds);
				1799	spin_unlock(&inode->i_lock);
				1800	}
				1801	}
				1802	}
				1803
				1804	void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
				1805	struct ceph_mds_session *session)
				1806	{
				1807	struct ceph_inode_info *ci;
				1808
				1809	kick_flushing_capsnaps(mdsc, session);
				1810
				1811	dout("kick_flushing_caps mds%d\n", session->s_mds);
				1812	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
				1813	struct inode *inode = &ci->vfs_inode;
				1814	struct ceph_cap *cap;
				1815	int delayed = 0;
				1816
				1817	spin_lock(&inode->i_lock);
				1818	cap = ci->i_auth_cap;
				1819	if (cap && cap->session == session) {
				1820	dout("kick_flushing_caps %p cap %p %s\n", inode,
				1821	cap, ceph_cap_string(ci->i_flushing_caps));
				1822	delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
				1823	__ceph_caps_used(ci),
				1824	__ceph_caps_wanted(ci),
				1825	cap->issued \| cap->implemented,
				1826	ci->i_flushing_caps, NULL);
				1827	if (delayed) {
				1828	spin_lock(&inode->i_lock);
				1829	__cap_delay_requeue(mdsc, ci);
				1830	spin_unlock(&inode->i_lock);
				1831	}
				1832	} else {
				1833	pr_err("%p auth cap %p not mds%d ???\n", inode,
				1834	cap, session->s_mds);
				1835	spin_unlock(&inode->i_lock);
				1836	}
				1837	}
				1838	}
				1839
				1840
				1841	/*
				1842	* Take references to capabilities we hold, so that we don't release
				1843	* them to the MDS prematurely.
				1844	*
				1845	* Protected by i_lock.
				1846	*/
				1847	static void __take_cap_refs(struct ceph_inode_info *ci, int got)
				1848	{
				1849	if (got & CEPH_CAP_PIN)
				1850	ci->i_pin_ref++;
				1851	if (got & CEPH_CAP_FILE_RD)
				1852	ci->i_rd_ref++;
				1853	if (got & CEPH_CAP_FILE_CACHE)
				1854	ci->i_rdcache_ref++;
				1855	if (got & CEPH_CAP_FILE_WR)
				1856	ci->i_wr_ref++;
				1857	if (got & CEPH_CAP_FILE_BUFFER) {
				1858	if (ci->i_wrbuffer_ref == 0)
				1859	igrab(&ci->vfs_inode);
				1860	ci->i_wrbuffer_ref++;
				1861	dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
				1862	&ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
				1863	}
				1864	}
				1865
				1866	/*
				1867	* Try to grab cap references. Specify those refs we @want, and the
				1868	* minimal set we @need. Also include the larger offset we are writing
				1869	* to (when applicable), and check against max_size here as well.
				1870	* Note that caller is responsible for ensuring max_size increases are
				1871	* requested from the MDS.
				1872	*/
				1873	static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
				1874	int got, loff_t endoff, int check_max, int *err)
				1875	{
				1876	struct inode *inode = &ci->vfs_inode;
				1877	int ret = 0;
				1878	int have, implemented;
				1879
				1880	dout("get_cap_refs %p need %s want %s\n", inode,
				1881	ceph_cap_string(need), ceph_cap_string(want));
				1882	spin_lock(&inode->i_lock);
				1883
				1884	/* make sure we _have_ some caps! */
				1885	if (!__ceph_is_any_caps(ci)) {
				1886	dout("get_cap_refs %p no real caps\n", inode);
				1887	*err = -EBADF;
				1888	ret = 1;
				1889	goto out;
				1890	}
				1891
				1892	if (need & CEPH_CAP_FILE_WR) {
				1893	if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
				1894	dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
				1895	inode, endoff, ci->i_max_size);
				1896	if (endoff > ci->i_wanted_max_size) {
				1897	*check_max = 1;
				1898	ret = 1;
				1899	}
				1900	goto out;
				1901	}
				1902	/*
				1903	* If a sync write is in progress, we must wait, so that we
				1904	* can get a final snapshot value for size+mtime.
				1905	*/
				1906	if (__ceph_have_pending_cap_snap(ci)) {
				1907	dout("get_cap_refs %p cap_snap_pending\n", inode);
				1908	goto out;
				1909	}
				1910	}
				1911	have = __ceph_caps_issued(ci, &implemented);
				1912
				1913	/*
				1914	* disallow writes while a truncate is pending
				1915	*/
				1916	if (ci->i_truncate_pending)
				1917	have &= ~CEPH_CAP_FILE_WR;
				1918
				1919	if ((have & need) == need) {
				1920	/*
				1921	* Look at (implemented & ~have & not) so that we keep waiting
				1922	* on transition from wanted -> needed caps. This is needed
				1923	* for WRBUFFER\|WR -> WR to avoid a new WR sync write from
				1924	* going before a prior buffered writeback happens.
				1925	*/
				1926	int not = want & ~(have & need);
				1927	int revoking = implemented & ~have;
				1928	dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
				1929	inode, ceph_cap_string(have), ceph_cap_string(not),
				1930	ceph_cap_string(revoking));
				1931	if ((revoking & not) == 0) {
				1932	*got = need \| (have & want);
				1933	__take_cap_refs(ci, *got);
				1934	ret = 1;
				1935	}
				1936	} else {
				1937	dout("get_cap_refs %p have %s needed %s\n", inode,
				1938	ceph_cap_string(have), ceph_cap_string(need));
				1939	}
				1940	out:
				1941	spin_unlock(&inode->i_lock);
				1942	dout("get_cap_refs %p ret %d got %s\n", inode,
				1943	ret, ceph_cap_string(*got));
				1944	return ret;
				1945	}
				1946
				1947	/*
				1948	* Check the offset we are writing up to against our current
				1949	* max_size. If necessary, tell the MDS we want to write to
				1950	* a larger offset.
				1951	*/
				1952	static void check_max_size(struct inode *inode, loff_t endoff)
				1953	{
				1954	struct ceph_inode_info *ci = ceph_inode(inode);
				1955	int check = 0;
				1956
				1957	/* do we need to explicitly request a larger max_size? */
				1958	spin_lock(&inode->i_lock);
				1959	if ((endoff >= ci->i_max_size \|\|
				1960	endoff > (inode->i_size << 1)) &&
				1961	endoff > ci->i_wanted_max_size) {
				1962	dout("write %p at large endoff %llu, req max_size\n",
				1963	inode, endoff);
				1964	ci->i_wanted_max_size = endoff;
				1965	check = 1;
				1966	}
				1967	spin_unlock(&inode->i_lock);
				1968	if (check)
				1969	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				1970	}
				1971
				1972	/*
				1973	* Wait for caps, and take cap references. If we can't get a WR cap
				1974	* due to a small max_size, make sure we check_max_size (and possibly
				1975	* ask the mds) so we don't get hung up indefinitely.
				1976	*/
				1977	int ceph_get_caps(struct ceph_inode_info ci, int need, int want, int got,
				1978	loff_t endoff)
				1979	{
				1980	int check_max, ret, err;
				1981
				1982	retry:
				1983	if (endoff > 0)
				1984	check_max_size(&ci->vfs_inode, endoff);
				1985	check_max = 0;
				1986	err = 0;
				1987	ret = wait_event_interruptible(ci->i_cap_wq,
				1988	try_get_cap_refs(ci, need, want,
				1989	got, endoff,
				1990	&check_max, &err));
				1991	if (err)
				1992	ret = err;
				1993	if (check_max)
				1994	goto retry;
				1995	return ret;
				1996	}
				1997
				1998	/*
				1999	* Take cap refs. Caller must already know we hold at least one ref
				2000	* on the caps in question or we don't know this is safe.
				2001	*/
				2002	void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
				2003	{
				2004	spin_lock(&ci->vfs_inode.i_lock);
				2005	__take_cap_refs(ci, caps);
				2006	spin_unlock(&ci->vfs_inode.i_lock);
				2007	}
				2008
				2009	/*
				2010	* Release cap refs.
				2011	*
				2012	* If we released the last ref on any given cap, call ceph_check_caps
				2013	* to release (or schedule a release).
				2014	*
				2015	* If we are releasing a WR cap (from a sync write), finalize any affected
				2016	* cap_snap, and wake up any waiters.
				2017	*/
				2018	void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
				2019	{
				2020	struct inode *inode = &ci->vfs_inode;
				2021	int last = 0, put = 0, flushsnaps = 0, wake = 0;
				2022	struct ceph_cap_snap *capsnap;
				2023
				2024	spin_lock(&inode->i_lock);
				2025	if (had & CEPH_CAP_PIN)
				2026	--ci->i_pin_ref;
				2027	if (had & CEPH_CAP_FILE_RD)
				2028	if (--ci->i_rd_ref == 0)
				2029	last++;
				2030	if (had & CEPH_CAP_FILE_CACHE)
				2031	if (--ci->i_rdcache_ref == 0)
				2032	last++;
				2033	if (had & CEPH_CAP_FILE_BUFFER) {
				2034	if (--ci->i_wrbuffer_ref == 0) {
				2035	last++;
				2036	put++;
				2037	}
				2038	dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
				2039	inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
				2040	}
				2041	if (had & CEPH_CAP_FILE_WR)
				2042	if (--ci->i_wr_ref == 0) {
				2043	last++;
				2044	if (!list_empty(&ci->i_cap_snaps)) {
				2045	capsnap = list_first_entry(&ci->i_cap_snaps,
				2046	struct ceph_cap_snap,
				2047	ci_item);
				2048	if (capsnap->writing) {
				2049	capsnap->writing = 0;
				2050	flushsnaps =
				2051	__ceph_finish_cap_snap(ci,
				2052	capsnap);
				2053	wake = 1;
				2054	}
				2055	}
				2056	}
				2057	spin_unlock(&inode->i_lock);
				2058
				2059	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
				2060	last ? "last" : "");
				2061
				2062	if (last && !flushsnaps)
				2063	ceph_check_caps(ci, 0, NULL);
				2064	else if (flushsnaps)
				2065	ceph_flush_snaps(ci);
				2066	if (wake)
				2067	wake_up(&ci->i_cap_wq);
				2068	if (put)
				2069	iput(inode);
				2070	}
				2071
				2072	/*
				2073	* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
				2074	* context. Adjust per-snap dirty page accounting as appropriate.
				2075	* Once all dirty data for a cap_snap is flushed, flush snapped file
				2076	* metadata back to the MDS. If we dropped the last ref, call
				2077	* ceph_check_caps.
				2078	*/
				2079	void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				2080	struct ceph_snap_context *snapc)
				2081	{
				2082	struct inode *inode = &ci->vfs_inode;
				2083	int last = 0;
				2084	int last_snap = 0;
				2085	int found = 0;
				2086	struct ceph_cap_snap *capsnap = NULL;
				2087
				2088	spin_lock(&inode->i_lock);
				2089	ci->i_wrbuffer_ref -= nr;
				2090	last = !ci->i_wrbuffer_ref;
				2091
				2092	if (ci->i_head_snapc == snapc) {
				2093	ci->i_wrbuffer_ref_head -= nr;
				2094	if (!ci->i_wrbuffer_ref_head) {
				2095	ceph_put_snap_context(ci->i_head_snapc);
				2096	ci->i_head_snapc = NULL;
				2097	}
				2098	dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
				2099	inode,
				2100	ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
				2101	ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
				2102	last ? " LAST" : "");
				2103	} else {
				2104	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				2105	if (capsnap->context == snapc) {
				2106	found = 1;
				2107	capsnap->dirty_pages -= nr;
				2108	last_snap = !capsnap->dirty_pages;
				2109	break;
				2110	}
				2111	}
				2112	BUG_ON(!found);
				2113	dout("put_wrbuffer_cap_refs on %p cap_snap %p "
				2114	" snap %lld %d/%d -> %d/%d %s%s\n",
				2115	inode, capsnap, capsnap->context->seq,
				2116	ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
				2117	ci->i_wrbuffer_ref, capsnap->dirty_pages,
				2118	last ? " (wrbuffer last)" : "",
				2119	last_snap ? " (capsnap last)" : "");
				2120	}
				2121
				2122	spin_unlock(&inode->i_lock);
				2123
				2124	if (last) {
				2125	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				2126	iput(inode);
				2127	} else if (last_snap) {
				2128	ceph_flush_snaps(ci);
				2129	wake_up(&ci->i_cap_wq);
				2130	}
				2131	}
				2132
				2133	/*
				2134	* Handle a cap GRANT message from the MDS. (Note that a GRANT may
				2135	* actually be a revocation if it specifies a smaller cap set.)
				2136	*
				2137	* caller holds s_mutex.
				2138	* return value:
				2139	* 0 - ok
				2140	* 1 - check_caps on auth cap only (writeback)
				2141	* 2 - check_caps (ack revoke)
				2142	*/
				2143	static int handle_cap_grant(struct inode inode, struct ceph_mds_caps grant,
				2144	struct ceph_mds_session *session,
				2145	struct ceph_cap *cap,
				2146	struct ceph_buffer *xattr_buf)
				2147	__releases(inode->i_lock)
				2148
				2149	{
				2150	struct ceph_inode_info *ci = ceph_inode(inode);
				2151	int mds = session->s_mds;
				2152	int seq = le32_to_cpu(grant->seq);
				2153	int newcaps = le32_to_cpu(grant->caps);
				2154	int issued, implemented, used, wanted, dirty;
				2155	u64 size = le64_to_cpu(grant->size);
				2156	u64 max_size = le64_to_cpu(grant->max_size);
				2157	struct timespec mtime, atime, ctime;
				2158	int reply = 0;
				2159	int wake = 0;
				2160	int writeback = 0;
				2161	int revoked_rdcache = 0;
				2162	int invalidate_async = 0;
				2163	int tried_invalidate = 0;
				2164	int ret;
				2165
				2166	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
				2167	inode, cap, mds, seq, ceph_cap_string(newcaps));
				2168	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
				2169	inode->i_size);
				2170
				2171	/*
				2172	* If CACHE is being revoked, and we have no dirty buffers,
				2173	* try to invalidate (once). (If there are dirty buffers, we
				2174	* will invalidate _after_ writeback.)
				2175	*/
				2176	restart:
				2177	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
				2178	!ci->i_wrbuffer_ref && !tried_invalidate) {
				2179	dout("CACHE invalidation\n");
				2180	spin_unlock(&inode->i_lock);
				2181	tried_invalidate = 1;
				2182
				2183	ret = invalidate_inode_pages2(&inode->i_data);
				2184	spin_lock(&inode->i_lock);
				2185	if (ret < 0) {
				2186	/* there were locked pages.. invalidate later
				2187	in a separate thread. */
				2188	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				2189	invalidate_async = 1;
				2190	ci->i_rdcache_revoking = ci->i_rdcache_gen;
				2191	}
				2192	} else {
				2193	/* we successfully invalidated those pages */
				2194	revoked_rdcache = 1;
				2195	ci->i_rdcache_gen = 0;
				2196	ci->i_rdcache_revoking = 0;
				2197	}
				2198	goto restart;
				2199	}
				2200
				2201	/* side effects now are allowed */
				2202
				2203	issued = __ceph_caps_issued(ci, &implemented);
				2204	issued \|= implemented \| __ceph_caps_dirty(ci);
				2205
				2206	cap->gen = session->s_cap_gen;
				2207
				2208	__check_cap_issue(ci, cap, newcaps);
				2209
				2210	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
				2211	inode->i_mode = le32_to_cpu(grant->mode);
				2212	inode->i_uid = le32_to_cpu(grant->uid);
				2213	inode->i_gid = le32_to_cpu(grant->gid);
				2214	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
				2215	inode->i_uid, inode->i_gid);
				2216	}
				2217
				2218	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
				2219	inode->i_nlink = le32_to_cpu(grant->nlink);
				2220
				2221	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
				2222	int len = le32_to_cpu(grant->xattr_len);
				2223	u64 version = le64_to_cpu(grant->xattr_version);
				2224
				2225	if (version > ci->i_xattrs.version) {
				2226	dout(" got new xattrs v%llu on %p len %d\n",
				2227	version, inode, len);
				2228	if (ci->i_xattrs.blob)
				2229	ceph_buffer_put(ci->i_xattrs.blob);
				2230	ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
				2231	ci->i_xattrs.version = version;
				2232	}
				2233	}
				2234
				2235	/* size/ctime/mtime/atime? */
				2236	ceph_fill_file_size(inode, issued,
				2237	le32_to_cpu(grant->truncate_seq),
				2238	le64_to_cpu(grant->truncate_size), size);
				2239	ceph_decode_timespec(&mtime, &grant->mtime);
				2240	ceph_decode_timespec(&atime, &grant->atime);
				2241	ceph_decode_timespec(&ctime, &grant->ctime);
				2242	ceph_fill_file_time(inode, issued,
				2243	le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
				2244	&atime);
				2245
				2246	/* max size increase? */
				2247	if (max_size != ci->i_max_size) {
				2248	dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
				2249	ci->i_max_size = max_size;
				2250	if (max_size >= ci->i_wanted_max_size) {
				2251	ci->i_wanted_max_size = 0; /* reset */
				2252	ci->i_requested_max_size = 0;
				2253	}
				2254	wake = 1;
				2255	}
				2256
				2257	/* check cap bits */
				2258	wanted = __ceph_caps_wanted(ci);
				2259	used = __ceph_caps_used(ci);
				2260	dirty = __ceph_caps_dirty(ci);
				2261	dout(" my wanted = %s, used = %s, dirty %s\n",
				2262	ceph_cap_string(wanted),
				2263	ceph_cap_string(used),
				2264	ceph_cap_string(dirty));
				2265	if (wanted != le32_to_cpu(grant->wanted)) {
				2266	dout("mds wanted %s -> %s\n",
				2267	ceph_cap_string(le32_to_cpu(grant->wanted)),
				2268	ceph_cap_string(wanted));
				2269	grant->wanted = cpu_to_le32(wanted);
				2270	}
				2271
				2272	cap->seq = seq;
				2273
				2274	/* file layout may have changed */
				2275	ci->i_layout = grant->layout;
				2276
				2277	/* revocation, grant, or no-op? */
				2278	if (cap->issued & ~newcaps) {
				2279	dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
				2280	ceph_cap_string(newcaps));
				2281	if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
				2282	writeback = 1; /* will delay ack */
				2283	else if (dirty & ~newcaps)
				2284	reply = 1; /* initiate writeback in check_caps */
				2285	else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 \|\|
				2286	revoked_rdcache)
				2287	reply = 2; /* send revoke ack in check_caps */
				2288	cap->issued = newcaps;
				2289	} else if (cap->issued == newcaps) {
				2290	dout("caps unchanged: %s -> %s\n",
				2291	ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
				2292	} else {
				2293	dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
				2294	ceph_cap_string(newcaps));
				2295	cap->issued = newcaps;
				2296	cap->implemented \|= newcaps; /* add bits only, to
				2297	* avoid stepping on a
				2298	* pending revocation */
				2299	wake = 1;
				2300	}
				2301
				2302	spin_unlock(&inode->i_lock);
				2303	if (writeback) {
				2304	/*
				2305	* queue inode for writeback: we can't actually call
				2306	* filemap_write_and_wait, etc. from message handler
				2307	* context.
				2308	*/
				2309	dout("queueing %p for writeback\n", inode);
				2310	if (ceph_queue_writeback(inode))
				2311	igrab(inode);
				2312	}
				2313	if (invalidate_async) {
				2314	dout("queueing %p for page invalidation\n", inode);
				2315	if (ceph_queue_page_invalidation(inode))
				2316	igrab(inode);
				2317	}
				2318	if (wake)
				2319	wake_up(&ci->i_cap_wq);
				2320	return reply;
				2321	}
				2322
				2323	/*
				2324	* Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
				2325	* MDS has been safely committed.
				2326	*/
				2327	static void handle_cap_flush_ack(struct inode *inode,
				2328	struct ceph_mds_caps *m,
				2329	struct ceph_mds_session *session,
				2330	struct ceph_cap *cap)
				2331	__releases(inode->i_lock)
				2332	{
				2333	struct ceph_inode_info *ci = ceph_inode(inode);
				2334	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
				2335	unsigned seq = le32_to_cpu(m->seq);
				2336	int dirty = le32_to_cpu(m->dirty);
				2337	int cleaned = 0;
				2338	u64 flush_tid = le64_to_cpu(m->client_tid);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2339	int drop = 0;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2340	int i;
				2341
				2342	for (i = 0; i < CEPH_CAP_BITS; i++)
				2343	if ((dirty & (1 << i)) &&
				2344	flush_tid == ci->i_cap_flush_tid[i])
				2345	cleaned \|= 1 << i;
				2346
				2347	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
				2348	" flushing %s -> %s\n",
				2349	inode, session->s_mds, seq, ceph_cap_string(dirty),
				2350	ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
				2351	ceph_cap_string(ci->i_flushing_caps & ~cleaned));
				2352
				2353	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
				2354	goto out;
				2355
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2356	ci->i_flushing_caps &= ~cleaned;
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2357
				2358	spin_lock(&mdsc->cap_dirty_lock);
				2359	if (ci->i_flushing_caps == 0) {
				2360	list_del_init(&ci->i_flushing_item);
				2361	if (!list_empty(&session->s_cap_flushing))
				2362	dout(" mds%d still flushing cap on %p\n",
				2363	session->s_mds,
				2364	&list_entry(session->s_cap_flushing.next,
				2365	struct ceph_inode_info,
				2366	i_flushing_item)->vfs_inode);
				2367	mdsc->num_cap_flushing--;
				2368	wake_up(&mdsc->cap_flushing_wq);
				2369	dout(" inode %p now !flushing\n", inode);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2370
				2371	if (ci->i_dirty_caps == 0) {
				2372	dout(" inode %p now clean\n", inode);
				2373	BUG_ON(!list_empty(&ci->i_dirty_item));
				2374	drop = 1;
Sage Weil	76e3b39	2009-10-15 18:13:53 -0700	[diff] [blame]	2375	} else {
				2376	BUG_ON(list_empty(&ci->i_dirty_item));
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2377	}
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2378	}
				2379	spin_unlock(&mdsc->cap_dirty_lock);
				2380	wake_up(&ci->i_cap_wq);
				2381
				2382	out:
				2383	spin_unlock(&inode->i_lock);
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2384	if (drop)
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2385	iput(inode);
				2386	}
				2387
				2388	/*
				2389	* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
				2390	* throw away our cap_snap.
				2391	*
				2392	* Caller hold s_mutex.
				2393	*/
				2394	static void handle_cap_flushsnap_ack(struct inode *inode,
				2395	struct ceph_mds_caps *m,
				2396	struct ceph_mds_session *session)
				2397	{
				2398	struct ceph_inode_info *ci = ceph_inode(inode);
				2399	u64 follows = le64_to_cpu(m->snap_follows);
				2400	u64 flush_tid = le64_to_cpu(m->client_tid);
				2401	struct ceph_cap_snap *capsnap;
				2402	int drop = 0;
				2403
				2404	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
				2405	inode, ci, session->s_mds, follows);
				2406
				2407	spin_lock(&inode->i_lock);
				2408	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
				2409	if (capsnap->follows == follows) {
				2410	if (capsnap->flush_tid != flush_tid) {
				2411	dout(" cap_snap %p follows %lld tid %lld !="
				2412	" %lld\n", capsnap, follows,
				2413	flush_tid, capsnap->flush_tid);
				2414	break;
				2415	}
				2416	WARN_ON(capsnap->dirty_pages \|\| capsnap->writing);
				2417	dout(" removing cap_snap %p follows %lld\n",
				2418	capsnap, follows);
				2419	ceph_put_snap_context(capsnap->context);
				2420	list_del(&capsnap->ci_item);
				2421	list_del(&capsnap->flushing_item);
				2422	ceph_put_cap_snap(capsnap);
				2423	drop = 1;
				2424	break;
				2425	} else {
				2426	dout(" skipping cap_snap %p follows %lld\n",
				2427	capsnap, capsnap->follows);
				2428	}
				2429	}
				2430	spin_unlock(&inode->i_lock);
				2431	if (drop)
				2432	iput(inode);
				2433	}
				2434
				2435	/*
				2436	* Handle TRUNC from MDS, indicating file truncation.
				2437	*
				2438	* caller hold s_mutex.
				2439	*/
				2440	static void handle_cap_trunc(struct inode *inode,
				2441	struct ceph_mds_caps *trunc,
				2442	struct ceph_mds_session *session)
				2443	__releases(inode->i_lock)
				2444	{
				2445	struct ceph_inode_info *ci = ceph_inode(inode);
				2446	int mds = session->s_mds;
				2447	int seq = le32_to_cpu(trunc->seq);
				2448	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
				2449	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
				2450	u64 size = le64_to_cpu(trunc->size);
				2451	int implemented = 0;
				2452	int dirty = __ceph_caps_dirty(ci);
				2453	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
				2454	int queue_trunc = 0;
				2455
				2456	issued \|= implemented \| dirty;
				2457
				2458	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
				2459	inode, mds, seq, truncate_size, truncate_seq);
				2460	queue_trunc = ceph_fill_file_size(inode, issued,
				2461	truncate_seq, truncate_size, size);
				2462	spin_unlock(&inode->i_lock);
				2463
				2464	if (queue_trunc)
				2465	if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
				2466	&ci->i_vmtruncate_work))
				2467	igrab(inode);
				2468	}
				2469
				2470	/*
				2471	* Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
				2472	* different one. If we are the most recent migration we've seen (as
				2473	* indicated by mseq), make note of the migrating cap bits for the
				2474	* duration (until we see the corresponding IMPORT).
				2475	*
				2476	* caller holds s_mutex
				2477	*/
				2478	static void handle_cap_export(struct inode inode, struct ceph_mds_caps ex,
				2479	struct ceph_mds_session *session)
				2480	{
				2481	struct ceph_inode_info *ci = ceph_inode(inode);
				2482	int mds = session->s_mds;
				2483	unsigned mseq = le32_to_cpu(ex->migrate_seq);
				2484	struct ceph_cap cap = NULL, t;
				2485	struct rb_node *p;
				2486	int remember = 1;
				2487
				2488	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
				2489	inode, ci, mds, mseq);
				2490
				2491	spin_lock(&inode->i_lock);
				2492
				2493	/* make sure we haven't seen a higher mseq */
				2494	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
				2495	t = rb_entry(p, struct ceph_cap, ci_node);
				2496	if (ceph_seq_cmp(t->mseq, mseq) > 0) {
				2497	dout(" higher mseq on cap from mds%d\n",
				2498	t->session->s_mds);
				2499	remember = 0;
				2500	}
				2501	if (t->session->s_mds == mds)
				2502	cap = t;
				2503	}
				2504
				2505	if (cap) {
				2506	if (remember) {
				2507	/* make note */
				2508	ci->i_cap_exporting_mds = mds;
				2509	ci->i_cap_exporting_mseq = mseq;
				2510	ci->i_cap_exporting_issued = cap->issued;
				2511	}
				2512	__ceph_remove_cap(cap, NULL);
				2513	} else {
				2514	WARN_ON(!cap);
				2515	}
				2516
				2517	spin_unlock(&inode->i_lock);
				2518	}
				2519
				2520	/*
				2521	* Handle cap IMPORT. If there are temp bits from an older EXPORT,
				2522	* clean them up.
				2523	*
				2524	* caller holds s_mutex.
				2525	*/
				2526	static void handle_cap_import(struct ceph_mds_client *mdsc,
				2527	struct inode inode, struct ceph_mds_caps im,
				2528	struct ceph_mds_session *session,
				2529	void *snaptrace, int snaptrace_len)
				2530	{
				2531	struct ceph_inode_info *ci = ceph_inode(inode);
				2532	int mds = session->s_mds;
				2533	unsigned issued = le32_to_cpu(im->caps);
				2534	unsigned wanted = le32_to_cpu(im->wanted);
				2535	unsigned seq = le32_to_cpu(im->seq);
				2536	unsigned mseq = le32_to_cpu(im->migrate_seq);
				2537	u64 realmino = le64_to_cpu(im->realm);
				2538	u64 cap_id = le64_to_cpu(im->cap_id);
				2539
				2540	if (ci->i_cap_exporting_mds >= 0 &&
				2541	ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
				2542	dout("handle_cap_import inode %p ci %p mds%d mseq %d"
				2543	" - cleared exporting from mds%d\n",
				2544	inode, ci, mds, mseq,
				2545	ci->i_cap_exporting_mds);
				2546	ci->i_cap_exporting_issued = 0;
				2547	ci->i_cap_exporting_mseq = 0;
				2548	ci->i_cap_exporting_mds = -1;
				2549	} else {
				2550	dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
				2551	inode, ci, mds, mseq);
				2552	}
				2553
				2554	down_write(&mdsc->snap_rwsem);
				2555	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
				2556	false);
				2557	downgrade_write(&mdsc->snap_rwsem);
				2558	ceph_add_cap(inode, session, cap_id, -1,
				2559	issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
				2560	NULL /* no caps context */);
				2561	try_flush_caps(inode, session, NULL);
				2562	up_read(&mdsc->snap_rwsem);
				2563	}
				2564
				2565	/*
				2566	* Handle a caps message from the MDS.
				2567	*
				2568	* Identify the appropriate session, inode, and call the right handler
				2569	* based on the cap op.
				2570	*/
				2571	void ceph_handle_caps(struct ceph_mds_session *session,
				2572	struct ceph_msg *msg)
				2573	{
				2574	struct ceph_mds_client *mdsc = session->s_mdsc;
				2575	struct super_block *sb = mdsc->client->sb;
				2576	struct inode *inode;
				2577	struct ceph_cap *cap;
				2578	struct ceph_mds_caps *h;
				2579	int mds = le64_to_cpu(msg->hdr.src.name.num);
				2580	int op;
				2581	u32 seq;
				2582	struct ceph_vino vino;
				2583	u64 cap_id;
				2584	u64 size, max_size;
				2585	int check_caps = 0;
				2586	int r;
				2587
				2588	dout("handle_caps from mds%d\n", mds);
				2589
				2590	/* decode */
				2591	if (msg->front.iov_len < sizeof(*h))
				2592	goto bad;
				2593	h = msg->front.iov_base;
				2594	op = le32_to_cpu(h->op);
				2595	vino.ino = le64_to_cpu(h->ino);
				2596	vino.snap = CEPH_NOSNAP;
				2597	cap_id = le64_to_cpu(h->cap_id);
				2598	seq = le32_to_cpu(h->seq);
				2599	size = le64_to_cpu(h->size);
				2600	max_size = le64_to_cpu(h->max_size);
				2601
				2602	mutex_lock(&session->s_mutex);
				2603	session->s_seq++;
				2604	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
				2605	(unsigned)seq);
				2606
				2607	/* lookup ino */
				2608	inode = ceph_find_inode(sb, vino);
				2609	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
				2610	vino.snap, inode);
				2611	if (!inode) {
				2612	dout(" i don't have ino %llx\n", vino.ino);
				2613	goto done;
				2614	}
				2615
				2616	/* these will work even if we don't have a cap yet */
				2617	switch (op) {
				2618	case CEPH_CAP_OP_FLUSHSNAP_ACK:
				2619	handle_cap_flushsnap_ack(inode, h, session);
				2620	goto done;
				2621
				2622	case CEPH_CAP_OP_EXPORT:
				2623	handle_cap_export(inode, h, session);
				2624	goto done;
				2625
				2626	case CEPH_CAP_OP_IMPORT:
				2627	handle_cap_import(mdsc, inode, h, session,
				2628	msg->middle,
				2629	le32_to_cpu(h->snap_trace_len));
				2630	check_caps = 1; /* we may have sent a RELEASE to the old auth */
				2631	goto done;
				2632	}
				2633
				2634	/* the rest require a cap */
				2635	spin_lock(&inode->i_lock);
				2636	cap = __get_cap_for_mds(ceph_inode(inode), mds);
				2637	if (!cap) {
				2638	dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
				2639	inode, ceph_ino(inode), ceph_snap(inode), mds);
				2640	spin_unlock(&inode->i_lock);
				2641	goto done;
				2642	}
				2643
				2644	/* note that each of these drops i_lock for us */
				2645	switch (op) {
				2646	case CEPH_CAP_OP_REVOKE:
				2647	case CEPH_CAP_OP_GRANT:
				2648	r = handle_cap_grant(inode, h, session, cap, msg->middle);
				2649	if (r == 1)
				2650	ceph_check_caps(ceph_inode(inode),
				2651	CHECK_CAPS_NODELAY\|CHECK_CAPS_AUTHONLY,
				2652	session);
				2653	else if (r == 2)
				2654	ceph_check_caps(ceph_inode(inode),
				2655	CHECK_CAPS_NODELAY,
				2656	session);
				2657	break;
				2658
				2659	case CEPH_CAP_OP_FLUSH_ACK:
				2660	handle_cap_flush_ack(inode, h, session, cap);
				2661	break;
				2662
				2663	case CEPH_CAP_OP_TRUNC:
				2664	handle_cap_trunc(inode, h, session);
				2665	break;
				2666
				2667	default:
				2668	spin_unlock(&inode->i_lock);
				2669	pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
				2670	ceph_cap_op_name(op));
				2671	}
				2672
				2673	done:
				2674	mutex_unlock(&session->s_mutex);
				2675
				2676	if (check_caps)
				2677	ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
				2678	if (inode)
				2679	iput(inode);
				2680	return;
				2681
				2682	bad:
				2683	pr_err("ceph_handle_caps: corrupt message\n");
				2684	return;
				2685	}
				2686
				2687	/*
				2688	* Delayed work handler to process end of delayed cap release LRU list.
				2689	*/
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2690	void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2691	{
				2692	struct ceph_inode_info *ci;
				2693	int flags = CHECK_CAPS_NODELAY;
				2694
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2695	dout("check_delayed_caps\n");
				2696	while (1) {
				2697	spin_lock(&mdsc->cap_delay_lock);
				2698	if (list_empty(&mdsc->cap_delay_list))
				2699	break;
				2700	ci = list_first_entry(&mdsc->cap_delay_list,
				2701	struct ceph_inode_info,
				2702	i_cap_delay_list);
				2703	if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
				2704	time_before(jiffies, ci->i_hold_caps_max))
				2705	break;
				2706	list_del_init(&ci->i_cap_delay_list);
				2707	spin_unlock(&mdsc->cap_delay_lock);
				2708	dout("check_delayed_caps on %p\n", &ci->vfs_inode);
				2709	ceph_check_caps(ci, flags, NULL);
				2710	}
				2711	spin_unlock(&mdsc->cap_delay_lock);
				2712	}
				2713
				2714	/*
Sage Weil	afcdaea	2009-10-14 14:27:38 -0700	[diff] [blame]	2715	* Flush all dirty caps to the mds
				2716	*/
				2717	void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
				2718	{
				2719	struct ceph_inode_info *ci;
				2720	struct inode *inode;
				2721
				2722	dout("flush_dirty_caps\n");
				2723	spin_lock(&mdsc->cap_dirty_lock);
				2724	while (!list_empty(&mdsc->cap_dirty)) {
				2725	ci = list_first_entry(&mdsc->cap_dirty,
				2726	struct ceph_inode_info,
				2727	i_dirty_item);
				2728	inode = igrab(&ci->vfs_inode);
				2729	spin_unlock(&mdsc->cap_dirty_lock);
				2730	if (inode) {
				2731	ceph_check_caps(ci, CHECK_CAPS_NODELAY\|CHECK_CAPS_FLUSH,
				2732	NULL);
				2733	iput(inode);
				2734	}
				2735	spin_lock(&mdsc->cap_dirty_lock);
				2736	}
				2737	spin_unlock(&mdsc->cap_dirty_lock);
				2738	}
				2739
				2740	/*
Sage Weil	a8599bd	2009-10-06 11:31:12 -0700	[diff] [blame]	2741	* Drop open file reference. If we were the last open file,
				2742	* we may need to release capabilities to the MDS (or schedule
				2743	* their delayed release).
				2744	*/
				2745	void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
				2746	{
				2747	struct inode *inode = &ci->vfs_inode;
				2748	int last = 0;
				2749
				2750	spin_lock(&inode->i_lock);
				2751	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
				2752	ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
				2753	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
				2754	if (--ci->i_nr_by_mode[fmode] == 0)
				2755	last++;
				2756	spin_unlock(&inode->i_lock);
				2757
				2758	if (last && ci->i_vino.snap == CEPH_NOSNAP)
				2759	ceph_check_caps(ci, 0, NULL);
				2760	}
				2761
				2762	/*
				2763	* Helpers for embedding cap and dentry lease releases into mds
				2764	* requests.
				2765	*
				2766	* @force is used by dentry_release (below) to force inclusion of a
				2767	* record for the directory inode, even when there aren't any caps to
				2768	* drop.
				2769	*/
				2770	int ceph_encode_inode_release(void *p, struct inode inode,
				2771	int mds, int drop, int unless, int force)
				2772	{
				2773	struct ceph_inode_info *ci = ceph_inode(inode);
				2774	struct ceph_cap *cap;
				2775	struct ceph_mds_request_release rel = p;
				2776	int ret = 0;
				2777
				2778	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
				2779	mds, ceph_cap_string(drop), ceph_cap_string(unless));
				2780
				2781	spin_lock(&inode->i_lock);
				2782	cap = __get_cap_for_mds(ci, mds);
				2783	if (cap && __cap_is_valid(cap)) {
				2784	if (force \|\|
				2785	((cap->issued & drop) &&
				2786	(cap->issued & unless) == 0)) {
				2787	if ((cap->issued & drop) &&
				2788	(cap->issued & unless) == 0) {
				2789	dout("encode_inode_release %p cap %p %s -> "
				2790	"%s\n", inode, cap,
				2791	ceph_cap_string(cap->issued),
				2792	ceph_cap_string(cap->issued & ~drop));
				2793	cap->issued &= ~drop;
				2794	cap->implemented &= ~drop;
				2795	if (ci->i_ceph_flags & CEPH_I_NODELAY) {
				2796	int wanted = __ceph_caps_wanted(ci);
				2797	dout(" wanted %s -> %s (act %s)\n",
				2798	ceph_cap_string(cap->mds_wanted),
				2799	ceph_cap_string(cap->mds_wanted &
				2800	~wanted),
				2801	ceph_cap_string(wanted));
				2802	cap->mds_wanted &= wanted;
				2803	}
				2804	} else {
				2805	dout("encode_inode_release %p cap %p %s"
				2806	" (force)\n", inode, cap,
				2807	ceph_cap_string(cap->issued));
				2808	}
				2809
				2810	rel->ino = cpu_to_le64(ceph_ino(inode));
				2811	rel->cap_id = cpu_to_le64(cap->cap_id);
				2812	rel->seq = cpu_to_le32(cap->seq);
				2813	rel->issue_seq = cpu_to_le32(cap->issue_seq),
				2814	rel->mseq = cpu_to_le32(cap->mseq);
				2815	rel->caps = cpu_to_le32(cap->issued);
				2816	rel->wanted = cpu_to_le32(cap->mds_wanted);
				2817	rel->dname_len = 0;
				2818	rel->dname_seq = 0;
				2819	p += sizeof(rel);
				2820	ret = 1;
				2821	} else {
				2822	dout("encode_inode_release %p cap %p %s\n",
				2823	inode, cap, ceph_cap_string(cap->issued));
				2824	}
				2825	}
				2826	spin_unlock(&inode->i_lock);
				2827	return ret;
				2828	}
				2829
				2830	int ceph_encode_dentry_release(void *p, struct dentry dentry,
				2831	int mds, int drop, int unless)
				2832	{
				2833	struct inode *dir = dentry->d_parent->d_inode;
				2834	struct ceph_mds_request_release rel = p;
				2835	struct ceph_dentry_info *di = ceph_dentry(dentry);
				2836	int force = 0;
				2837	int ret;
				2838
				2839	/*
				2840	* force an record for the directory caps if we have a dentry lease.
				2841	* this is racy (can't take i_lock and d_lock together), but it
				2842	* doesn't have to be perfect; the mds will revoke anything we don't
				2843	* release.
				2844	*/
				2845	spin_lock(&dentry->d_lock);
				2846	if (di->lease_session && di->lease_session->s_mds == mds)
				2847	force = 1;
				2848	spin_unlock(&dentry->d_lock);
				2849
				2850	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
				2851
				2852	spin_lock(&dentry->d_lock);
				2853	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
				2854	dout("encode_dentry_release %p mds%d seq %d\n",
				2855	dentry, mds, (int)di->lease_seq);
				2856	rel->dname_len = cpu_to_le32(dentry->d_name.len);
				2857	memcpy(*p, dentry->d_name.name, dentry->d_name.len);
				2858	*p += dentry->d_name.len;
				2859	rel->dname_seq = cpu_to_le32(di->lease_seq);
				2860	}
				2861	spin_unlock(&dentry->d_lock);
				2862	return ret;
				2863	}