Blame - fs/ceph/inode.c - kernel/msm-5.4

blob: 85b4d2ffdeba933da4ac816663a04c3c9ed91924 [file] [log] [blame]

Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1	#include "ceph_debug.h"
				2
				3	#include <linux/module.h>
				4	#include <linux/fs.h>
				5	#include <linux/smp_lock.h>
				6	#include <linux/slab.h>
				7	#include <linux/string.h>
				8	#include <linux/uaccess.h>
				9	#include <linux/kernel.h>
				10	#include <linux/namei.h>
				11	#include <linux/writeback.h>
				12	#include <linux/vmalloc.h>
Yehuda Sadeh	c9af9fb	2010-02-19 00:10:11 +0000	[diff] [blame]	13	#include <linux/pagevec.h>
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	14
				15	#include "super.h"
				16	#include "decode.h"
				17
				18	/*
				19	* Ceph inode operations
				20	*
				21	* Implement basic inode helpers (get, alloc) and inode ops (getattr,
				22	* setattr, etc.), xattr helpers, and helpers for assimilating
				23	* metadata returned by the MDS into our cache.
				24	*
				25	* Also define helpers for doing asynchronous writeback, invalidation,
				26	* and truncation for the benefit of those who can't afford to block
				27	* (typically because they are in the message handler path).
				28	*/
				29
				30	static const struct inode_operations ceph_symlink_iops;
				31
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	32	static void ceph_invalidate_work(struct work_struct *work);
				33	static void ceph_writeback_work(struct work_struct *work);
				34	static void ceph_vmtruncate_work(struct work_struct *work);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	35
				36	/*
				37	* find or create an inode, given the ceph ino number
				38	*/
				39	struct inode ceph_get_inode(struct super_block sb, struct ceph_vino vino)
				40	{
				41	struct inode *inode;
				42	ino_t t = ceph_vino_to_ino(vino);
				43
				44	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
				45	if (inode == NULL)
				46	return ERR_PTR(-ENOMEM);
				47	if (inode->i_state & I_NEW) {
				48	dout("get_inode created new inode %p %llx.%llx ino %llx\n",
				49	inode, ceph_vinop(inode), (u64)inode->i_ino);
				50	unlock_new_inode(inode);
				51	}
				52
				53	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
				54	vino.snap, inode);
				55	return inode;
				56	}
				57
				58	/*
				59	* get/constuct snapdir inode for a given directory
				60	*/
				61	struct inode ceph_get_snapdir(struct inode parent)
				62	{
				63	struct ceph_vino vino = {
				64	.ino = ceph_ino(parent),
				65	.snap = CEPH_SNAPDIR,
				66	};
				67	struct inode *inode = ceph_get_inode(parent->i_sb, vino);
Sage Weil	b377ff1	2009-11-11 15:22:37 -0800	[diff] [blame]	68	struct ceph_inode_info *ci = ceph_inode(inode);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	69
				70	BUG_ON(!S_ISDIR(parent->i_mode));
				71	if (IS_ERR(inode))
				72	return ERR_PTR(PTR_ERR(inode));
				73	inode->i_mode = parent->i_mode;
				74	inode->i_uid = parent->i_uid;
				75	inode->i_gid = parent->i_gid;
				76	inode->i_op = &ceph_dir_iops;
				77	inode->i_fop = &ceph_dir_fops;
Sage Weil	b377ff1	2009-11-11 15:22:37 -0800	[diff] [blame]	78	ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
				79	ci->i_rbytes = 0;
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	80	return inode;
				81	}
				82
				83	const struct inode_operations ceph_file_iops = {
				84	.permission = ceph_permission,
				85	.setattr = ceph_setattr,
				86	.getattr = ceph_getattr,
				87	.setxattr = ceph_setxattr,
				88	.getxattr = ceph_getxattr,
				89	.listxattr = ceph_listxattr,
				90	.removexattr = ceph_removexattr,
				91	};
				92
				93
				94	/*
				95	* We use a 'frag tree' to keep track of the MDS's directory fragments
				96	* for a given inode (usually there is just a single fragment). We
				97	* need to know when a child frag is delegated to a new MDS, or when
				98	* it is flagged as replicated, so we can direct our requests
				99	* accordingly.
				100	*/
				101
				102	/*
				103	* find/create a frag in the tree
				104	*/
				105	static struct ceph_inode_frag __get_or_create_frag(struct ceph_inode_info ci,
				106	u32 f)
				107	{
				108	struct rb_node **p;
				109	struct rb_node *parent = NULL;
				110	struct ceph_inode_frag *frag;
				111	int c;
				112
				113	p = &ci->i_fragtree.rb_node;
				114	while (*p) {
				115	parent = *p;
				116	frag = rb_entry(parent, struct ceph_inode_frag, node);
				117	c = ceph_frag_compare(f, frag->frag);
				118	if (c < 0)
				119	p = &(*p)->rb_left;
				120	else if (c > 0)
				121	p = &(*p)->rb_right;
				122	else
				123	return frag;
				124	}
				125
				126	frag = kmalloc(sizeof(*frag), GFP_NOFS);
				127	if (!frag) {
				128	pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
				129	"frag %x\n", &ci->vfs_inode,
				130	ceph_vinop(&ci->vfs_inode), f);
				131	return ERR_PTR(-ENOMEM);
				132	}
				133	frag->frag = f;
				134	frag->split_by = 0;
				135	frag->mds = -1;
				136	frag->ndist = 0;
				137
				138	rb_link_node(&frag->node, parent, p);
				139	rb_insert_color(&frag->node, &ci->i_fragtree);
				140
				141	dout("get_or_create_frag added %llx.%llx frag %x\n",
				142	ceph_vinop(&ci->vfs_inode), f);
				143	return frag;
				144	}
				145
				146	/*
				147	* find a specific frag @f
				148	*/
				149	struct ceph_inode_frag __ceph_find_frag(struct ceph_inode_info ci, u32 f)
				150	{
				151	struct rb_node *n = ci->i_fragtree.rb_node;
				152
				153	while (n) {
				154	struct ceph_inode_frag *frag =
				155	rb_entry(n, struct ceph_inode_frag, node);
				156	int c = ceph_frag_compare(f, frag->frag);
				157	if (c < 0)
				158	n = n->rb_left;
				159	else if (c > 0)
				160	n = n->rb_right;
				161	else
				162	return frag;
				163	}
				164	return NULL;
				165	}
				166
				167	/*
				168	* Choose frag containing the given value @v. If @pfrag is
				169	* specified, copy the frag delegation info to the caller if
				170	* it is present.
				171	*/
				172	u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
				173	struct ceph_inode_frag *pfrag,
				174	int *found)
				175	{
				176	u32 t = ceph_frag_make(0, 0);
				177	struct ceph_inode_frag *frag;
				178	unsigned nway, i;
				179	u32 n;
				180
				181	if (found)
				182	*found = 0;
				183
				184	mutex_lock(&ci->i_fragtree_mutex);
				185	while (1) {
				186	WARN_ON(!ceph_frag_contains_value(t, v));
				187	frag = __ceph_find_frag(ci, t);
				188	if (!frag)
				189	break; /* t is a leaf */
				190	if (frag->split_by == 0) {
				191	if (pfrag)
				192	memcpy(pfrag, frag, sizeof(*pfrag));
				193	if (found)
				194	*found = 1;
				195	break;
				196	}
				197
				198	/* choose child */
				199	nway = 1 << frag->split_by;
				200	dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
				201	frag->split_by, nway);
				202	for (i = 0; i < nway; i++) {
				203	n = ceph_frag_make_child(t, frag->split_by, i);
				204	if (ceph_frag_contains_value(n, v)) {
				205	t = n;
				206	break;
				207	}
				208	}
				209	BUG_ON(i == nway);
				210	}
				211	dout("choose_frag(%x) = %x\n", v, t);
				212
				213	mutex_unlock(&ci->i_fragtree_mutex);
				214	return t;
				215	}
				216
				217	/*
				218	* Process dirfrag (delegation) info from the mds. Include leaf
				219	* fragment in tree ONLY if ndist > 0. Otherwise, only
				220	* branches/splits are included in i_fragtree)
				221	*/
				222	static int ceph_fill_dirfrag(struct inode *inode,
				223	struct ceph_mds_reply_dirfrag *dirinfo)
				224	{
				225	struct ceph_inode_info *ci = ceph_inode(inode);
				226	struct ceph_inode_frag *frag;
				227	u32 id = le32_to_cpu(dirinfo->frag);
				228	int mds = le32_to_cpu(dirinfo->auth);
				229	int ndist = le32_to_cpu(dirinfo->ndist);
				230	int i;
				231	int err = 0;
				232
				233	mutex_lock(&ci->i_fragtree_mutex);
				234	if (ndist == 0) {
				235	/* no delegation info needed. */
				236	frag = __ceph_find_frag(ci, id);
				237	if (!frag)
				238	goto out;
				239	if (frag->split_by == 0) {
				240	/* tree leaf, remove */
				241	dout("fill_dirfrag removed %llx.%llx frag %x"
				242	" (no ref)\n", ceph_vinop(inode), id);
				243	rb_erase(&frag->node, &ci->i_fragtree);
				244	kfree(frag);
				245	} else {
				246	/* tree branch, keep and clear */
				247	dout("fill_dirfrag cleared %llx.%llx frag %x"
				248	" referral\n", ceph_vinop(inode), id);
				249	frag->mds = -1;
				250	frag->ndist = 0;
				251	}
				252	goto out;
				253	}
				254
				255
				256	/* find/add this frag to store mds delegation info */
				257	frag = __get_or_create_frag(ci, id);
				258	if (IS_ERR(frag)) {
				259	/* this is not the end of the world; we can continue
				260	with bad/inaccurate delegation info */
				261	pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
				262	ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
				263	err = -ENOMEM;
				264	goto out;
				265	}
				266
				267	frag->mds = mds;
				268	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
				269	for (i = 0; i < frag->ndist; i++)
				270	frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
				271	dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
				272	ceph_vinop(inode), frag->frag, frag->ndist);
				273
				274	out:
				275	mutex_unlock(&ci->i_fragtree_mutex);
				276	return err;
				277	}
				278
				279
				280	/*
				281	* initialize a newly allocated inode.
				282	*/
				283	struct inode ceph_alloc_inode(struct super_block sb)
				284	{
				285	struct ceph_inode_info *ci;
				286	int i;
				287
				288	ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
				289	if (!ci)
				290	return NULL;
				291
				292	dout("alloc_inode %p\n", &ci->vfs_inode);
				293
				294	ci->i_version = 0;
				295	ci->i_time_warp_seq = 0;
				296	ci->i_ceph_flags = 0;
				297	ci->i_release_count = 0;
				298	ci->i_symlink = NULL;
				299
				300	ci->i_fragtree = RB_ROOT;
				301	mutex_init(&ci->i_fragtree_mutex);
				302
				303	ci->i_xattrs.blob = NULL;
				304	ci->i_xattrs.prealloc_blob = NULL;
				305	ci->i_xattrs.dirty = false;
				306	ci->i_xattrs.index = RB_ROOT;
				307	ci->i_xattrs.count = 0;
				308	ci->i_xattrs.names_size = 0;
				309	ci->i_xattrs.vals_size = 0;
				310	ci->i_xattrs.version = 0;
				311	ci->i_xattrs.index_version = 0;
				312
				313	ci->i_caps = RB_ROOT;
				314	ci->i_auth_cap = NULL;
				315	ci->i_dirty_caps = 0;
				316	ci->i_flushing_caps = 0;
				317	INIT_LIST_HEAD(&ci->i_dirty_item);
				318	INIT_LIST_HEAD(&ci->i_flushing_item);
				319	ci->i_cap_flush_seq = 0;
				320	ci->i_cap_flush_last_tid = 0;
				321	memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
				322	init_waitqueue_head(&ci->i_cap_wq);
				323	ci->i_hold_caps_min = 0;
				324	ci->i_hold_caps_max = 0;
				325	INIT_LIST_HEAD(&ci->i_cap_delay_list);
				326	ci->i_cap_exporting_mds = 0;
				327	ci->i_cap_exporting_mseq = 0;
				328	ci->i_cap_exporting_issued = 0;
				329	INIT_LIST_HEAD(&ci->i_cap_snaps);
				330	ci->i_head_snapc = NULL;
				331	ci->i_snap_caps = 0;
				332
				333	for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
				334	ci->i_nr_by_mode[i] = 0;
				335
				336	ci->i_truncate_seq = 0;
				337	ci->i_truncate_size = 0;
				338	ci->i_truncate_pending = 0;
				339
				340	ci->i_max_size = 0;
				341	ci->i_reported_size = 0;
				342	ci->i_wanted_max_size = 0;
				343	ci->i_requested_max_size = 0;
				344
				345	ci->i_pin_ref = 0;
				346	ci->i_rd_ref = 0;
				347	ci->i_rdcache_ref = 0;
				348	ci->i_wr_ref = 0;
				349	ci->i_wrbuffer_ref = 0;
				350	ci->i_wrbuffer_ref_head = 0;
				351	ci->i_shared_gen = 0;
				352	ci->i_rdcache_gen = 0;
				353	ci->i_rdcache_revoking = 0;
				354
				355	INIT_LIST_HEAD(&ci->i_unsafe_writes);
				356	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
				357	spin_lock_init(&ci->i_unsafe_lock);
				358
				359	ci->i_snap_realm = NULL;
				360	INIT_LIST_HEAD(&ci->i_snap_realm_item);
				361	INIT_LIST_HEAD(&ci->i_snap_flush_item);
				362
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	363	INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
				364	INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	365
				366	INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
				367
				368	return &ci->vfs_inode;
				369	}
				370
				371	void ceph_destroy_inode(struct inode *inode)
				372	{
				373	struct ceph_inode_info *ci = ceph_inode(inode);
				374	struct ceph_inode_frag *frag;
				375	struct rb_node *n;
				376
				377	dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
				378
				379	ceph_queue_caps_release(inode);
				380
Sage Weil	8b218b8	2010-03-09 12:59:08 -0800	[diff] [blame]	381	/*
				382	* we may still have a snap_realm reference if there are stray
				383	* caps in i_cap_exporting_issued or i_snap_caps.
				384	*/
				385	if (ci->i_snap_realm) {
				386	struct ceph_mds_client *mdsc =
				387	&ceph_client(ci->vfs_inode.i_sb)->mdsc;
				388	struct ceph_snap_realm *realm = ci->i_snap_realm;
				389
				390	dout(" dropping residual ref to snap realm %p\n", realm);
				391	spin_lock(&realm->inodes_with_caps_lock);
				392	list_del_init(&ci->i_snap_realm_item);
				393	spin_unlock(&realm->inodes_with_caps_lock);
				394	ceph_put_snap_realm(mdsc, realm);
				395	}
				396
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	397	kfree(ci->i_symlink);
				398	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
				399	frag = rb_entry(n, struct ceph_inode_frag, node);
				400	rb_erase(n, &ci->i_fragtree);
				401	kfree(frag);
				402	}
				403
				404	__ceph_destroy_xattrs(ci);
Sage Weil	b6c1d5b	2009-12-07 12:17:17 -0800	[diff] [blame]	405	if (ci->i_xattrs.blob)
				406	ceph_buffer_put(ci->i_xattrs.blob);
				407	if (ci->i_xattrs.prealloc_blob)
				408	ceph_buffer_put(ci->i_xattrs.prealloc_blob);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	409
				410	kmem_cache_free(ceph_inode_cachep, ci);
				411	}
				412
				413
				414	/*
				415	* Helpers to fill in size, ctime, mtime, and atime. We have to be
				416	* careful because either the client or MDS may have more up to date
				417	* info, depending on which capabilities are held, and whether
				418	* time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
				419	* and size are monotonically increasing, except when utimes() or
				420	* truncate() increments the corresponding _seq values.)
				421	*/
				422	int ceph_fill_file_size(struct inode *inode, int issued,
				423	u32 truncate_seq, u64 truncate_size, u64 size)
				424	{
				425	struct ceph_inode_info *ci = ceph_inode(inode);
				426	int queue_trunc = 0;
				427
				428	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 \|\|
				429	(truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
				430	dout("size %lld -> %llu\n", inode->i_size, size);
				431	inode->i_size = size;
				432	inode->i_blocks = (size + (1<<9) - 1) >> 9;
				433	ci->i_reported_size = size;
				434	if (truncate_seq != ci->i_truncate_seq) {
				435	dout("truncate_seq %u -> %u\n",
				436	ci->i_truncate_seq, truncate_seq);
				437	ci->i_truncate_seq = truncate_seq;
Yehuda Sadeh	3d497d8	2010-02-09 11:08:40 -0800	[diff] [blame]	438	/*
				439	* If we hold relevant caps, or in the case where we're
				440	* not the only client referencing this file and we
				441	* don't hold those caps, then we need to check whether
				442	* the file is either opened or mmaped
				443	*/
				444	if ((issued & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_RD\|
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	445	CEPH_CAP_FILE_WR\|CEPH_CAP_FILE_BUFFER\|
Yehuda Sadeh	3d497d8	2010-02-09 11:08:40 -0800	[diff] [blame]	446	CEPH_CAP_FILE_EXCL)) \|\|
				447	mapping_mapped(inode->i_mapping) \|\|
				448	__ceph_caps_file_wanted(ci)) {
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	449	ci->i_truncate_pending++;
				450	queue_trunc = 1;
				451	}
				452	}
				453	}
				454	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
				455	ci->i_truncate_size != truncate_size) {
				456	dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
				457	truncate_size);
				458	ci->i_truncate_size = truncate_size;
				459	}
				460	return queue_trunc;
				461	}
				462
				463	void ceph_fill_file_time(struct inode *inode, int issued,
				464	u64 time_warp_seq, struct timespec *ctime,
				465	struct timespec mtime, struct timespec atime)
				466	{
				467	struct ceph_inode_info *ci = ceph_inode(inode);
				468	int warn = 0;
				469
				470	if (issued & (CEPH_CAP_FILE_EXCL\|
				471	CEPH_CAP_FILE_WR\|
				472	CEPH_CAP_FILE_BUFFER)) {
				473	if (timespec_compare(ctime, &inode->i_ctime) > 0) {
				474	dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
				475	inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
				476	ctime->tv_sec, ctime->tv_nsec);
				477	inode->i_ctime = *ctime;
				478	}
				479	if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
				480	/* the MDS did a utimes() */
				481	dout("mtime %ld.%09ld -> %ld.%09ld "
				482	"tw %d -> %d\n",
				483	inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
				484	mtime->tv_sec, mtime->tv_nsec,
				485	ci->i_time_warp_seq, (int)time_warp_seq);
				486
				487	inode->i_mtime = *mtime;
				488	inode->i_atime = *atime;
				489	ci->i_time_warp_seq = time_warp_seq;
				490	} else if (time_warp_seq == ci->i_time_warp_seq) {
				491	/* nobody did utimes(); take the max */
				492	if (timespec_compare(mtime, &inode->i_mtime) > 0) {
				493	dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
				494	inode->i_mtime.tv_sec,
				495	inode->i_mtime.tv_nsec,
				496	mtime->tv_sec, mtime->tv_nsec);
				497	inode->i_mtime = *mtime;
				498	}
				499	if (timespec_compare(atime, &inode->i_atime) > 0) {
				500	dout("atime %ld.%09ld -> %ld.%09ld inc\n",
				501	inode->i_atime.tv_sec,
				502	inode->i_atime.tv_nsec,
				503	atime->tv_sec, atime->tv_nsec);
				504	inode->i_atime = *atime;
				505	}
				506	} else if (issued & CEPH_CAP_FILE_EXCL) {
				507	/* we did a utimes(); ignore mds values */
				508	} else {
				509	warn = 1;
				510	}
				511	} else {
				512	/* we have no write caps; whatever the MDS says is true */
				513	if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
				514	inode->i_ctime = *ctime;
				515	inode->i_mtime = *mtime;
				516	inode->i_atime = *atime;
				517	ci->i_time_warp_seq = time_warp_seq;
				518	} else {
				519	warn = 1;
				520	}
				521	}
				522	if (warn) /* time_warp_seq shouldn't go backwards */
				523	dout("%p mds time_warp_seq %llu < %u\n",
				524	inode, time_warp_seq, ci->i_time_warp_seq);
				525	}
				526
				527	/*
				528	* Populate an inode based on info from mds. May be called on new or
				529	* existing inodes.
				530	*/
				531	static int fill_inode(struct inode *inode,
				532	struct ceph_mds_reply_info_in *iinfo,
				533	struct ceph_mds_reply_dirfrag *dirinfo,
				534	struct ceph_mds_session *session,
				535	unsigned long ttl_from, int cap_fmode,
				536	struct ceph_cap_reservation *caps_reservation)
				537	{
				538	struct ceph_mds_reply_inode *info = iinfo->in;
				539	struct ceph_inode_info *ci = ceph_inode(inode);
				540	int i;
				541	int issued, implemented;
				542	struct timespec mtime, atime, ctime;
				543	u32 nsplits;
				544	struct ceph_buffer *xattr_blob = NULL;
				545	int err = 0;
				546	int queue_trunc = 0;
				547
				548	dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
				549	inode, ceph_vinop(inode), le64_to_cpu(info->version),
				550	ci->i_version);
				551
				552	/*
				553	* prealloc xattr data, if it looks like we'll need it. only
				554	* if len > 4 (meaning there are actually xattrs; the first 4
				555	* bytes are the xattr count).
				556	*/
				557	if (iinfo->xattr_len > 4) {
Sage Weil	b6c1d5b	2009-12-07 12:17:17 -0800	[diff] [blame]	558	xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	559	if (!xattr_blob)
				560	pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
				561	iinfo->xattr_len);
				562	}
				563
				564	spin_lock(&inode->i_lock);
				565
				566	/*
				567	* provided version will be odd if inode value is projected,
				568	* even if stable. skip the update if we have a newer info
				569	* (e.g., due to inode info racing form multiple MDSs), or if
				570	* we are getting projected (unstable) inode info.
				571	*/
				572	if (le64_to_cpu(info->version) > 0 &&
				573	(ci->i_version & ~1) > le64_to_cpu(info->version))
				574	goto no_change;
				575
				576	issued = __ceph_caps_issued(ci, &implemented);
				577	issued \|= implemented \| __ceph_caps_dirty(ci);
				578
				579	/* update inode */
				580	ci->i_version = le64_to_cpu(info->version);
				581	inode->i_version++;
				582	inode->i_rdev = le32_to_cpu(info->rdev);
				583
				584	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
				585	inode->i_mode = le32_to_cpu(info->mode);
				586	inode->i_uid = le32_to_cpu(info->uid);
				587	inode->i_gid = le32_to_cpu(info->gid);
				588	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
				589	inode->i_uid, inode->i_gid);
				590	}
				591
				592	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
				593	inode->i_nlink = le32_to_cpu(info->nlink);
				594
				595	/* be careful with mtime, atime, size */
				596	ceph_decode_timespec(&atime, &info->atime);
				597	ceph_decode_timespec(&mtime, &info->mtime);
				598	ceph_decode_timespec(&ctime, &info->ctime);
				599	queue_trunc = ceph_fill_file_size(inode, issued,
				600	le32_to_cpu(info->truncate_seq),
				601	le64_to_cpu(info->truncate_size),
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	602	le64_to_cpu(info->size));
				603	ceph_fill_file_time(inode, issued,
				604	le32_to_cpu(info->time_warp_seq),
				605	&ctime, &mtime, &atime);
				606
				607	ci->i_max_size = le64_to_cpu(info->max_size);
				608	ci->i_layout = info->layout;
				609	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
				610
				611	/* xattrs */
				612	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
				613	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
				614	le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
				615	if (ci->i_xattrs.blob)
				616	ceph_buffer_put(ci->i_xattrs.blob);
				617	ci->i_xattrs.blob = xattr_blob;
				618	if (xattr_blob)
				619	memcpy(ci->i_xattrs.blob->vec.iov_base,
				620	iinfo->xattr_data, iinfo->xattr_len);
				621	ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
				622	}
				623
				624	inode->i_mapping->a_ops = &ceph_aops;
				625	inode->i_mapping->backing_dev_info =
				626	&ceph_client(inode->i_sb)->backing_dev_info;
				627
				628	switch (inode->i_mode & S_IFMT) {
				629	case S_IFIFO:
				630	case S_IFBLK:
				631	case S_IFCHR:
				632	case S_IFSOCK:
				633	init_special_inode(inode, inode->i_mode, inode->i_rdev);
				634	inode->i_op = &ceph_file_iops;
				635	break;
				636	case S_IFREG:
				637	inode->i_op = &ceph_file_iops;
				638	inode->i_fop = &ceph_file_fops;
				639	break;
				640	case S_IFLNK:
				641	inode->i_op = &ceph_symlink_iops;
				642	if (!ci->i_symlink) {
				643	int symlen = iinfo->symlink_len;
				644	char *sym;
				645
				646	BUG_ON(symlen != inode->i_size);
				647	spin_unlock(&inode->i_lock);
				648
				649	err = -ENOMEM;
				650	sym = kmalloc(symlen+1, GFP_NOFS);
				651	if (!sym)
				652	goto out;
				653	memcpy(sym, iinfo->symlink, symlen);
				654	sym[symlen] = 0;
				655
				656	spin_lock(&inode->i_lock);
				657	if (!ci->i_symlink)
				658	ci->i_symlink = sym;
				659	else
				660	kfree(sym); /* lost a race */
				661	}
				662	break;
				663	case S_IFDIR:
				664	inode->i_op = &ceph_dir_iops;
				665	inode->i_fop = &ceph_dir_fops;
				666
				667	ci->i_files = le64_to_cpu(info->files);
				668	ci->i_subdirs = le64_to_cpu(info->subdirs);
				669	ci->i_rbytes = le64_to_cpu(info->rbytes);
				670	ci->i_rfiles = le64_to_cpu(info->rfiles);
				671	ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
				672	ceph_decode_timespec(&ci->i_rctime, &info->rctime);
				673
				674	/* set dir completion flag? */
				675	if (ci->i_files == 0 && ci->i_subdirs == 0 &&
				676	ceph_snap(inode) == CEPH_NOSNAP &&
				677	(le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
				678	dout(" marking %p complete (empty)\n", inode);
				679	ci->i_ceph_flags \|= CEPH_I_COMPLETE;
				680	ci->i_max_offset = 2;
				681	}
				682
				683	/* it may be better to set st_size in getattr instead? */
				684	if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
				685	inode->i_size = ci->i_rbytes;
				686	break;
				687	default:
				688	pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
				689	ceph_vinop(inode), inode->i_mode);
				690	}
				691
				692	no_change:
				693	spin_unlock(&inode->i_lock);
				694
				695	/* queue truncate if we saw i_size decrease */
				696	if (queue_trunc)
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	697	ceph_queue_vmtruncate(inode);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	698
				699	/* populate frag tree */
				700	/* FIXME: move me up, if/when version reflects fragtree changes */
				701	nsplits = le32_to_cpu(info->fragtree.nsplits);
				702	mutex_lock(&ci->i_fragtree_mutex);
				703	for (i = 0; i < nsplits; i++) {
				704	u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
				705	struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
				706
				707	if (IS_ERR(frag))
				708	continue;
				709	frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
				710	dout(" frag %x split by %d\n", frag->frag, frag->split_by);
				711	}
				712	mutex_unlock(&ci->i_fragtree_mutex);
				713
				714	/* were we issued a capability? */
				715	if (info->cap.caps) {
				716	if (ceph_snap(inode) == CEPH_NOSNAP) {
				717	ceph_add_cap(inode, session,
				718	le64_to_cpu(info->cap.cap_id),
				719	cap_fmode,
				720	le32_to_cpu(info->cap.caps),
				721	le32_to_cpu(info->cap.wanted),
				722	le32_to_cpu(info->cap.seq),
				723	le32_to_cpu(info->cap.mseq),
				724	le64_to_cpu(info->cap.realm),
				725	info->cap.flags,
				726	caps_reservation);
				727	} else {
				728	spin_lock(&inode->i_lock);
				729	dout(" %p got snap_caps %s\n", inode,
				730	ceph_cap_string(le32_to_cpu(info->cap.caps)));
				731	ci->i_snap_caps \|= le32_to_cpu(info->cap.caps);
				732	if (cap_fmode >= 0)
				733	__ceph_get_fmode(ci, cap_fmode);
				734	spin_unlock(&inode->i_lock);
				735	}
Sage Weil	04d000e	2010-05-07 11:26:34 -0700	[diff] [blame]	736	} else if (cap_fmode >= 0) {
				737	pr_warning("mds issued no caps on %llx.%llx\n",
				738	ceph_vinop(inode));
				739	__ceph_get_fmode(ci, cap_fmode);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	740	}
				741
				742	/* update delegation info? */
				743	if (dirinfo)
				744	ceph_fill_dirfrag(inode, dirinfo);
				745
				746	err = 0;
				747
				748	out:
Sage Weil	b6c1d5b	2009-12-07 12:17:17 -0800	[diff] [blame]	749	if (xattr_blob)
				750	ceph_buffer_put(xattr_blob);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	751	return err;
				752	}
				753
				754	/*
				755	* caller should hold session s_mutex.
				756	*/
				757	static void update_dentry_lease(struct dentry *dentry,
				758	struct ceph_mds_reply_lease *lease,
				759	struct ceph_mds_session *session,
				760	unsigned long from_time)
				761	{
				762	struct ceph_dentry_info *di = ceph_dentry(dentry);
				763	long unsigned duration = le32_to_cpu(lease->duration_ms);
				764	long unsigned ttl = from_time + (duration * HZ) / 1000;
				765	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
				766	struct inode *dir;
				767
				768	/* only track leases on regular dentries */
				769	if (dentry->d_op != &ceph_dentry_ops)
				770	return;
				771
				772	spin_lock(&dentry->d_lock);
				773	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
				774	dentry, le16_to_cpu(lease->mask), duration, ttl);
				775
				776	/* make lease_rdcache_gen match directory */
				777	dir = dentry->d_parent->d_inode;
				778	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
				779
				780	if (lease->mask == 0)
				781	goto out_unlock;
				782
				783	if (di->lease_gen == session->s_cap_gen &&
				784	time_before(ttl, dentry->d_time))
				785	goto out_unlock; /* we already have a newer lease. */
				786
				787	if (di->lease_session && di->lease_session != session)
				788	goto out_unlock;
				789
				790	ceph_dentry_lru_touch(dentry);
				791
				792	if (!di->lease_session)
				793	di->lease_session = ceph_get_mds_session(session);
				794	di->lease_gen = session->s_cap_gen;
				795	di->lease_seq = le32_to_cpu(lease->seq);
				796	di->lease_renew_after = half_ttl;
				797	di->lease_renew_from = 0;
				798	dentry->d_time = ttl;
				799	out_unlock:
				800	spin_unlock(&dentry->d_lock);
				801	return;
				802	}
				803
				804	/*
				805	* splice a dentry to an inode.
				806	* caller must hold directory i_mutex for this to be safe.
				807	*
				808	* we will only rehash the resulting dentry if @prehash is
				809	* true; @prehash will be set to false (for the benefit of
				810	* the caller) if we fail.
				811	*/
				812	static struct dentry splice_dentry(struct dentry dn, struct inode *in,
				813	bool *prehash)
				814	{
				815	struct dentry *realdn;
				816
				817	/* dn must be unhashed */
				818	if (!d_unhashed(dn))
				819	d_drop(dn);
				820	realdn = d_materialise_unique(dn, in);
				821	if (IS_ERR(realdn)) {
				822	pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
				823	dn, in, ceph_vinop(in));
				824	if (prehash)
				825	prehash = false; / don't rehash on error */
				826	dn = realdn; /* note realdn contains the error */
				827	goto out;
				828	} else if (realdn) {
				829	dout("dn %p (%d) spliced with %p (%d) "
				830	"inode %p ino %llx.%llx\n",
				831	dn, atomic_read(&dn->d_count),
				832	realdn, atomic_read(&realdn->d_count),
				833	realdn->d_inode, ceph_vinop(realdn->d_inode));
				834	dput(dn);
				835	dn = realdn;
				836	} else {
				837	BUG_ON(!ceph_dentry(dn));
				838
				839	dout("dn %p attached to %p ino %llx.%llx\n",
				840	dn, dn->d_inode, ceph_vinop(dn->d_inode));
				841	}
				842	if ((!prehash \|\| *prehash) && d_unhashed(dn))
				843	d_rehash(dn);
				844	out:
				845	return dn;
				846	}
				847
				848	/*
Yehuda Sadeh	4baa75e	2010-01-07 15:36:32 -0800	[diff] [blame]	849	* Set dentry's directory position based on the current dir's max, and
				850	* order it in d_subdirs, so that dcache_readdir behaves.
				851	*/
				852	static void ceph_set_dentry_offset(struct dentry *dn)
				853	{
				854	struct dentry *dir = dn->d_parent;
				855	struct inode *inode = dn->d_parent->d_inode;
				856	struct ceph_dentry_info *di;
				857
				858	BUG_ON(!inode);
				859
				860	di = ceph_dentry(dn);
				861
				862	spin_lock(&inode->i_lock);
				863	di->offset = ceph_inode(inode)->i_max_offset++;
				864	spin_unlock(&inode->i_lock);
				865
				866	spin_lock(&dcache_lock);
				867	spin_lock(&dn->d_lock);
				868	list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
				869	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
				870	dn->d_u.d_child.prev, dn->d_u.d_child.next);
				871	spin_unlock(&dn->d_lock);
				872	spin_unlock(&dcache_lock);
				873	}
				874
				875	/*
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	876	* Incorporate results into the local cache. This is either just
				877	* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
				878	* after a lookup).
				879	*
				880	* A reply may contain
				881	* a directory inode along with a dentry.
				882	* and/or a target inode
				883	*
				884	* Called with snap_rwsem (read).
				885	*/
				886	int ceph_fill_trace(struct super_block sb, struct ceph_mds_request req,
				887	struct ceph_mds_session *session)
				888	{
				889	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
				890	struct inode *in = NULL;
				891	struct ceph_mds_reply_inode *ininfo;
				892	struct ceph_vino vino;
Sage Weil	9358c6d	2010-03-30 13:54:41 -0700	[diff] [blame]	893	struct ceph_client *client = ceph_sb_to_client(sb);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	894	int i = 0;
				895	int err = 0;
				896
				897	dout("fill_trace %p is_dentry %d is_target %d\n", req,
				898	rinfo->head->is_dentry, rinfo->head->is_target);
				899
				900	#if 0
				901	/*
				902	* Debugging hook:
				903	*
				904	* If we resend completed ops to a recovering mds, we get no
				905	* trace. Since that is very rare, pretend this is the case
				906	* to ensure the 'no trace' handlers in the callers behave.
				907	*
				908	* Fill in inodes unconditionally to avoid breaking cap
				909	* invariants.
				910	*/
				911	if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
				912	pr_info("fill_trace faking empty trace on %lld %s\n",
				913	req->r_tid, ceph_mds_op_name(rinfo->head->op));
				914	if (rinfo->head->is_dentry) {
				915	rinfo->head->is_dentry = 0;
				916	err = fill_inode(req->r_locked_dir,
				917	&rinfo->diri, rinfo->dirfrag,
				918	session, req->r_request_started, -1);
				919	}
				920	if (rinfo->head->is_target) {
				921	rinfo->head->is_target = 0;
				922	ininfo = rinfo->targeti.in;
				923	vino.ino = le64_to_cpu(ininfo->ino);
				924	vino.snap = le64_to_cpu(ininfo->snapid);
				925	in = ceph_get_inode(sb, vino);
				926	err = fill_inode(in, &rinfo->targeti, NULL,
				927	session, req->r_request_started,
				928	req->r_fmode);
				929	iput(in);
				930	}
				931	}
				932	#endif
				933
				934	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
				935	dout("fill_trace reply is empty!\n");
				936	if (rinfo->head->result == 0 && req->r_locked_dir) {
				937	struct ceph_inode_info *ci =
				938	ceph_inode(req->r_locked_dir);
				939	dout(" clearing %p complete (empty trace)\n",
				940	req->r_locked_dir);
				941	ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
				942	ci->i_release_count++;
				943	}
				944	return 0;
				945	}
				946
				947	if (rinfo->head->is_dentry) {
Sage Weil	5b1daec	2010-01-25 11:33:08 -0800	[diff] [blame]	948	struct inode *dir = req->r_locked_dir;
				949
				950	err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
				951	session, req->r_request_started, -1,
				952	&req->r_caps_reservation);
				953	if (err < 0)
				954	return err;
				955	}
				956
Sage Weil	9358c6d	2010-03-30 13:54:41 -0700	[diff] [blame]	957	/*
				958	* ignore null lease/binding on snapdir ENOENT, or else we
				959	* will have trouble splicing in the virtual snapdir later
				960	*/
				961	if (rinfo->head->is_dentry && !req->r_aborted &&
				962	(rinfo->head->is_target \|\| strncmp(req->r_dentry->d_name.name,
				963	client->mount_args->snapdir_name,
				964	req->r_dentry->d_name.len))) {
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	965	/*
				966	* lookup link rename : null -> possibly existing inode
				967	* mknod symlink mkdir : null -> new inode
				968	* unlink : linked -> null
				969	*/
				970	struct inode *dir = req->r_locked_dir;
				971	struct dentry *dn = req->r_dentry;
				972	bool have_dir_cap, have_lease;
				973
				974	BUG_ON(!dn);
				975	BUG_ON(!dir);
				976	BUG_ON(dn->d_parent->d_inode != dir);
				977	BUG_ON(ceph_ino(dir) !=
				978	le64_to_cpu(rinfo->diri.in->ino));
				979	BUG_ON(ceph_snap(dir) !=
				980	le64_to_cpu(rinfo->diri.in->snapid));
				981
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	982	/* do we have a lease on the whole dir? */
				983	have_dir_cap =
				984	(le32_to_cpu(rinfo->diri.in->cap.caps) &
				985	CEPH_CAP_FILE_SHARED);
				986
				987	/* do we have a dn lease? */
				988	have_lease = have_dir_cap \|\|
				989	(le16_to_cpu(rinfo->dlease->mask) &
				990	CEPH_LOCK_DN);
				991
				992	if (!have_lease)
				993	dout("fill_trace no dentry lease or dir cap\n");
				994
				995	/* rename? */
				996	if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
				997	dout(" src %p '%.s' dst %p '%.s'\n",
				998	req->r_old_dentry,
				999	req->r_old_dentry->d_name.len,
				1000	req->r_old_dentry->d_name.name,
				1001	dn, dn->d_name.len, dn->d_name.name);
				1002	dout("fill_trace doing d_move %p -> %p\n",
				1003	req->r_old_dentry, dn);
Sage Weil	c10f5e1	2010-04-16 12:56:11 -0700	[diff] [blame]	1004
				1005	/* d_move screws up d_subdirs order */
				1006	ceph_i_clear(dir, CEPH_I_COMPLETE);
				1007
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1008	d_move(req->r_old_dentry, dn);
				1009	dout(" src %p '%.s' dst %p '%.s'\n",
				1010	req->r_old_dentry,
				1011	req->r_old_dentry->d_name.len,
				1012	req->r_old_dentry->d_name.name,
				1013	dn, dn->d_name.len, dn->d_name.name);
Sage Weil	c4a29f2	2009-12-21 11:42:18 -0800	[diff] [blame]	1014	/* ensure target dentry is invalidated, despite
				1015	rehashing bug in vfs_rename_dir */
				1016	dn->d_time = jiffies;
				1017	ceph_dentry(dn)->lease_shared_gen = 0;
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1018	/* take overwritten dentry's readdir offset */
				1019	ceph_dentry(req->r_old_dentry)->offset =
				1020	ceph_dentry(dn)->offset;
				1021	dn = req->r_old_dentry; /* use old_dentry */
				1022	in = dn->d_inode;
				1023	}
				1024
				1025	/* null dentry? */
				1026	if (!rinfo->head->is_target) {
				1027	dout("fill_trace null dentry\n");
				1028	if (dn->d_inode) {
				1029	dout("d_delete %p\n", dn);
				1030	d_delete(dn);
				1031	} else {
				1032	dout("d_instantiate %p NULL\n", dn);
				1033	d_instantiate(dn, NULL);
				1034	if (have_lease && d_unhashed(dn))
				1035	d_rehash(dn);
				1036	update_dentry_lease(dn, rinfo->dlease,
				1037	session,
				1038	req->r_request_started);
				1039	}
				1040	goto done;
				1041	}
				1042
				1043	/* attach proper inode */
				1044	ininfo = rinfo->targeti.in;
				1045	vino.ino = le64_to_cpu(ininfo->ino);
				1046	vino.snap = le64_to_cpu(ininfo->snapid);
				1047	if (!dn->d_inode) {
				1048	in = ceph_get_inode(sb, vino);
				1049	if (IS_ERR(in)) {
				1050	pr_err("fill_trace bad get_inode "
				1051	"%llx.%llx\n", vino.ino, vino.snap);
				1052	err = PTR_ERR(in);
				1053	d_delete(dn);
				1054	goto done;
				1055	}
				1056	dn = splice_dentry(dn, in, &have_lease);
				1057	if (IS_ERR(dn)) {
				1058	err = PTR_ERR(dn);
				1059	goto done;
				1060	}
				1061	req->r_dentry = dn; /* may have spliced */
Yehuda Sadeh	4baa75e	2010-01-07 15:36:32 -0800	[diff] [blame]	1062	ceph_set_dentry_offset(dn);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1063	igrab(in);
				1064	} else if (ceph_ino(in) == vino.ino &&
				1065	ceph_snap(in) == vino.snap) {
				1066	igrab(in);
				1067	} else {
				1068	dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
				1069	dn, in, ceph_ino(in), ceph_snap(in),
				1070	vino.ino, vino.snap);
				1071	have_lease = false;
				1072	in = NULL;
				1073	}
				1074
				1075	if (have_lease)
				1076	update_dentry_lease(dn, rinfo->dlease, session,
				1077	req->r_request_started);
				1078	dout(" final dn %p\n", dn);
				1079	i++;
				1080	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP \|\|
				1081	req->r_op == CEPH_MDS_OP_MKSNAP) {
				1082	struct dentry *dn = req->r_dentry;
				1083
				1084	/* fill out a snapdir LOOKUPSNAP dentry */
				1085	BUG_ON(!dn);
				1086	BUG_ON(!req->r_locked_dir);
				1087	BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
				1088	ininfo = rinfo->targeti.in;
				1089	vino.ino = le64_to_cpu(ininfo->ino);
				1090	vino.snap = le64_to_cpu(ininfo->snapid);
				1091	in = ceph_get_inode(sb, vino);
				1092	if (IS_ERR(in)) {
				1093	pr_err("fill_inode get_inode badness %llx.%llx\n",
				1094	vino.ino, vino.snap);
				1095	err = PTR_ERR(in);
				1096	d_delete(dn);
				1097	goto done;
				1098	}
				1099	dout(" linking snapped dir %p to dn %p\n", in, dn);
				1100	dn = splice_dentry(dn, in, NULL);
				1101	if (IS_ERR(dn)) {
				1102	err = PTR_ERR(dn);
				1103	goto done;
				1104	}
Yehuda Sadeh	4baa75e	2010-01-07 15:36:32 -0800	[diff] [blame]	1105	ceph_set_dentry_offset(dn);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1106	req->r_dentry = dn; /* may have spliced */
				1107	igrab(in);
				1108	rinfo->head->is_dentry = 1; /* fool notrace handlers */
				1109	}
				1110
				1111	if (rinfo->head->is_target) {
				1112	vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
				1113	vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
				1114
				1115	if (in == NULL \|\| ceph_ino(in) != vino.ino \|\|
				1116	ceph_snap(in) != vino.snap) {
				1117	in = ceph_get_inode(sb, vino);
				1118	if (IS_ERR(in)) {
				1119	err = PTR_ERR(in);
				1120	goto done;
				1121	}
				1122	}
				1123	req->r_target_inode = in;
				1124
				1125	err = fill_inode(in,
				1126	&rinfo->targeti, NULL,
				1127	session, req->r_request_started,
				1128	(le32_to_cpu(rinfo->head->result) == 0) ?
				1129	req->r_fmode : -1,
				1130	&req->r_caps_reservation);
				1131	if (err < 0) {
				1132	pr_err("fill_inode badness %p %llx.%llx\n",
				1133	in, ceph_vinop(in));
				1134	goto done;
				1135	}
				1136	}
				1137
				1138	done:
				1139	dout("fill_trace done err=%d\n", err);
				1140	return err;
				1141	}
				1142
				1143	/*
				1144	* Prepopulate our cache with readdir results, leases, etc.
				1145	*/
				1146	int ceph_readdir_prepopulate(struct ceph_mds_request *req,
				1147	struct ceph_mds_session *session)
				1148	{
				1149	struct dentry *parent = req->r_dentry;
				1150	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
				1151	struct qstr dname;
				1152	struct dentry *dn;
				1153	struct inode *in;
				1154	int err = 0, i;
				1155	struct inode *snapdir = NULL;
				1156	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
				1157	u64 frag = le32_to_cpu(rhead->args.readdir.frag);
				1158	struct ceph_dentry_info *di;
				1159
				1160	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
				1161	snapdir = ceph_get_snapdir(parent->d_inode);
				1162	parent = d_find_alias(snapdir);
				1163	dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
				1164	rinfo->dir_nr, parent);
				1165	} else {
				1166	dout("readdir_prepopulate %d items under dn %p\n",
				1167	rinfo->dir_nr, parent);
				1168	if (rinfo->dir_dir)
				1169	ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
				1170	}
				1171
				1172	for (i = 0; i < rinfo->dir_nr; i++) {
				1173	struct ceph_vino vino;
				1174
				1175	dname.name = rinfo->dir_dname[i];
				1176	dname.len = rinfo->dir_dname_len[i];
				1177	dname.hash = full_name_hash(dname.name, dname.len);
				1178
				1179	vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
				1180	vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
				1181
				1182	retry_lookup:
				1183	dn = d_lookup(parent, &dname);
				1184	dout("d_lookup on parent=%p name=%.*s got %p\n",
				1185	parent, dname.len, dname.name, dn);
				1186
				1187	if (!dn) {
				1188	dn = d_alloc(parent, &dname);
				1189	dout("d_alloc %p '%.*s' = %p\n", parent,
				1190	dname.len, dname.name, dn);
				1191	if (dn == NULL) {
				1192	dout("d_alloc badness\n");
				1193	err = -ENOMEM;
				1194	goto out;
				1195	}
				1196	err = ceph_init_dentry(dn);
				1197	if (err < 0)
				1198	goto out;
				1199	} else if (dn->d_inode &&
				1200	(ceph_ino(dn->d_inode) != vino.ino \|\|
				1201	ceph_snap(dn->d_inode) != vino.snap)) {
				1202	dout(" dn %p points to wrong inode %p\n",
				1203	dn, dn->d_inode);
				1204	d_delete(dn);
				1205	dput(dn);
				1206	goto retry_lookup;
				1207	} else {
				1208	/* reorder parent's d_subdirs */
				1209	spin_lock(&dcache_lock);
				1210	spin_lock(&dn->d_lock);
				1211	list_move(&dn->d_u.d_child, &parent->d_subdirs);
				1212	spin_unlock(&dn->d_lock);
				1213	spin_unlock(&dcache_lock);
				1214	}
				1215
				1216	di = dn->d_fsdata;
				1217	di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
				1218
				1219	/* inode */
				1220	if (dn->d_inode) {
				1221	in = dn->d_inode;
				1222	} else {
				1223	in = ceph_get_inode(parent->d_sb, vino);
				1224	if (in == NULL) {
				1225	dout("new_inode badness\n");
				1226	d_delete(dn);
				1227	dput(dn);
				1228	err = -ENOMEM;
				1229	goto out;
				1230	}
				1231	dn = splice_dentry(dn, in, NULL);
				1232	}
				1233
				1234	if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
				1235	req->r_request_started, -1,
				1236	&req->r_caps_reservation) < 0) {
				1237	pr_err("fill_inode badness on %p\n", in);
				1238	dput(dn);
				1239	continue;
				1240	}
				1241	update_dentry_lease(dn, rinfo->dir_dlease[i],
				1242	req->r_session, req->r_request_started);
				1243	dput(dn);
				1244	}
				1245	req->r_did_prepopulate = true;
				1246
				1247	out:
				1248	if (snapdir) {
				1249	iput(snapdir);
				1250	dput(parent);
				1251	}
				1252	dout("readdir_prepopulate done\n");
				1253	return err;
				1254	}
				1255
				1256	int ceph_inode_set_size(struct inode *inode, loff_t size)
				1257	{
				1258	struct ceph_inode_info *ci = ceph_inode(inode);
				1259	int ret = 0;
				1260
				1261	spin_lock(&inode->i_lock);
				1262	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
				1263	inode->i_size = size;
				1264	inode->i_blocks = (size + (1 << 9) - 1) >> 9;
				1265
				1266	/* tell the MDS if we are approaching max_size */
				1267	if ((size << 1) >= ci->i_max_size &&
				1268	(ci->i_reported_size << 1) < ci->i_max_size)
				1269	ret = 1;
				1270
				1271	spin_unlock(&inode->i_lock);
				1272	return ret;
				1273	}
				1274
				1275	/*
				1276	* Write back inode data in a worker thread. (This can't be done
				1277	* in the message handler context.)
				1278	*/
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	1279	void ceph_queue_writeback(struct inode *inode)
				1280	{
				1281	if (queue_work(ceph_inode_to_client(inode)->wb_wq,
				1282	&ceph_inode(inode)->i_wb_work)) {
Sage Weil	2c27c9a	2010-02-17 15:45:51 -0800	[diff] [blame]	1283	dout("ceph_queue_writeback %p\n", inode);
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	1284	igrab(inode);
				1285	} else {
Sage Weil	2c27c9a	2010-02-17 15:45:51 -0800	[diff] [blame]	1286	dout("ceph_queue_writeback %p failed\n", inode);
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	1287	}
				1288	}
				1289
				1290	static void ceph_writeback_work(struct work_struct *work)
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1291	{
				1292	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
				1293	i_wb_work);
				1294	struct inode *inode = &ci->vfs_inode;
				1295
				1296	dout("writeback %p\n", inode);
				1297	filemap_fdatawrite(&inode->i_data);
				1298	iput(inode);
				1299	}
				1300
				1301	/*
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	1302	* queue an async invalidation
				1303	*/
				1304	void ceph_queue_invalidate(struct inode *inode)
				1305	{
				1306	if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
				1307	&ceph_inode(inode)->i_pg_inv_work)) {
				1308	dout("ceph_queue_invalidate %p\n", inode);
				1309	igrab(inode);
				1310	} else {
				1311	dout("ceph_queue_invalidate %p failed\n", inode);
				1312	}
				1313	}
				1314
				1315	/*
Yehuda Sadeh	c9af9fb	2010-02-19 00:10:11 +0000	[diff] [blame]	1316	* invalidate any pages that are not dirty or under writeback. this
				1317	* includes pages that are clean and mapped.
				1318	*/
				1319	static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
				1320	{
				1321	struct pagevec pvec;
				1322	pgoff_t next = 0;
				1323	int i;
				1324
				1325	pagevec_init(&pvec, 0);
				1326	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
				1327	for (i = 0; i < pagevec_count(&pvec); i++) {
				1328	struct page *page = pvec.pages[i];
				1329	pgoff_t index;
				1330	int skip_page =
				1331	(PageDirty(page) \|\| PageWriteback(page));
				1332
				1333	if (!skip_page)
				1334	skip_page = !trylock_page(page);
				1335
				1336	/*
				1337	* We really shouldn't be looking at the ->index of an
				1338	* unlocked page. But we're not allowed to lock these
				1339	* pages. So we rely upon nobody altering the ->index
				1340	* of this (pinned-by-us) page.
				1341	*/
				1342	index = page->index;
				1343	if (index > next)
				1344	next = index;
				1345	next++;
				1346
				1347	if (skip_page)
				1348	continue;
				1349
				1350	generic_error_remove_page(mapping, page);
				1351	unlock_page(page);
				1352	}
				1353	pagevec_release(&pvec);
				1354	cond_resched();
				1355	}
				1356	}
				1357
				1358	/*
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1359	* Invalidate inode pages in a worker thread. (This can't be done
				1360	* in the message handler context.)
				1361	*/
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	1362	static void ceph_invalidate_work(struct work_struct *work)
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1363	{
				1364	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
				1365	i_pg_inv_work);
				1366	struct inode *inode = &ci->vfs_inode;
				1367	u32 orig_gen;
				1368	int check = 0;
				1369
				1370	spin_lock(&inode->i_lock);
				1371	dout("invalidate_pages %p gen %d revoking %d\n", inode,
				1372	ci->i_rdcache_gen, ci->i_rdcache_revoking);
				1373	if (ci->i_rdcache_gen == 0 \|\|
				1374	ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				1375	BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
				1376	/* nevermind! */
				1377	ci->i_rdcache_revoking = 0;
				1378	spin_unlock(&inode->i_lock);
				1379	goto out;
				1380	}
				1381	orig_gen = ci->i_rdcache_gen;
				1382	spin_unlock(&inode->i_lock);
				1383
Yehuda Sadeh	c9af9fb	2010-02-19 00:10:11 +0000	[diff] [blame]	1384	ceph_invalidate_nondirty_pages(inode->i_mapping);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1385
				1386	spin_lock(&inode->i_lock);
				1387	if (orig_gen == ci->i_rdcache_gen) {
				1388	dout("invalidate_pages %p gen %d successful\n", inode,
				1389	ci->i_rdcache_gen);
				1390	ci->i_rdcache_gen = 0;
				1391	ci->i_rdcache_revoking = 0;
				1392	check = 1;
				1393	} else {
				1394	dout("invalidate_pages %p gen %d raced, gen now %d\n",
				1395	inode, orig_gen, ci->i_rdcache_gen);
				1396	}
				1397	spin_unlock(&inode->i_lock);
				1398
				1399	if (check)
				1400	ceph_check_caps(ci, 0, NULL);
				1401	out:
				1402	iput(inode);
				1403	}
				1404
				1405
				1406	/*
				1407	* called by trunc_wq; take i_mutex ourselves
				1408	*
				1409	* We also truncate in a separate thread as well.
				1410	*/
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	1411	static void ceph_vmtruncate_work(struct work_struct *work)
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1412	{
				1413	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
				1414	i_vmtruncate_work);
				1415	struct inode *inode = &ci->vfs_inode;
				1416
				1417	dout("vmtruncate_work %p\n", inode);
				1418	mutex_lock(&inode->i_mutex);
				1419	__ceph_do_pending_vmtruncate(inode);
				1420	mutex_unlock(&inode->i_mutex);
				1421	iput(inode);
				1422	}
				1423
				1424	/*
Sage Weil	3c6f6b7	2010-02-09 15:24:44 -0800	[diff] [blame]	1425	* Queue an async vmtruncate. If we fail to queue work, we will handle
				1426	* the truncation the next time we call __ceph_do_pending_vmtruncate.
				1427	*/
				1428	void ceph_queue_vmtruncate(struct inode *inode)
				1429	{
				1430	struct ceph_inode_info *ci = ceph_inode(inode);
				1431
				1432	if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
				1433	&ci->i_vmtruncate_work)) {
				1434	dout("ceph_queue_vmtruncate %p\n", inode);
				1435	igrab(inode);
				1436	} else {
				1437	dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
				1438	inode, ci->i_truncate_pending);
				1439	}
				1440	}
				1441
				1442	/*
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1443	* called with i_mutex held.
				1444	*
				1445	* Make sure any pending truncation is applied before doing anything
				1446	* that may depend on it.
				1447	*/
				1448	void __ceph_do_pending_vmtruncate(struct inode *inode)
				1449	{
				1450	struct ceph_inode_info *ci = ceph_inode(inode);
				1451	u64 to;
				1452	int wrbuffer_refs, wake = 0;
				1453
				1454	retry:
				1455	spin_lock(&inode->i_lock);
				1456	if (ci->i_truncate_pending == 0) {
				1457	dout("__do_pending_vmtruncate %p none pending\n", inode);
				1458	spin_unlock(&inode->i_lock);
				1459	return;
				1460	}
				1461
				1462	/*
				1463	* make sure any dirty snapped pages are flushed before we
				1464	* possibly truncate them.. so write AND block!
				1465	*/
				1466	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
				1467	dout("__do_pending_vmtruncate %p flushing snaps first\n",
				1468	inode);
				1469	spin_unlock(&inode->i_lock);
				1470	filemap_write_and_wait_range(&inode->i_data, 0,
				1471	inode->i_sb->s_maxbytes);
				1472	goto retry;
				1473	}
				1474
				1475	to = ci->i_truncate_size;
				1476	wrbuffer_refs = ci->i_wrbuffer_ref;
				1477	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
				1478	ci->i_truncate_pending, to);
				1479	spin_unlock(&inode->i_lock);
				1480
				1481	truncate_inode_pages(inode->i_mapping, to);
				1482
				1483	spin_lock(&inode->i_lock);
				1484	ci->i_truncate_pending--;
				1485	if (ci->i_truncate_pending == 0)
				1486	wake = 1;
				1487	spin_unlock(&inode->i_lock);
				1488
				1489	if (wrbuffer_refs == 0)
				1490	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
				1491	if (wake)
				1492	wake_up(&ci->i_cap_wq);
				1493	}
				1494
				1495
				1496	/*
				1497	* symlinks
				1498	*/
				1499	static void ceph_sym_follow_link(struct dentry dentry, struct nameidata *nd)
				1500	{
				1501	struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
				1502	nd_set_link(nd, ci->i_symlink);
				1503	return NULL;
				1504	}
				1505
				1506	static const struct inode_operations ceph_symlink_iops = {
				1507	.readlink = generic_readlink,
				1508	.follow_link = ceph_sym_follow_link,
				1509	};
				1510
				1511	/*
				1512	* setattr
				1513	*/
				1514	int ceph_setattr(struct dentry dentry, struct iattr attr)
				1515	{
				1516	struct inode *inode = dentry->d_inode;
				1517	struct ceph_inode_info *ci = ceph_inode(inode);
				1518	struct inode *parent_inode = dentry->d_parent->d_inode;
				1519	const unsigned int ia_valid = attr->ia_valid;
				1520	struct ceph_mds_request *req;
				1521	struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
				1522	int issued;
				1523	int release = 0, dirtied = 0;
				1524	int mask = 0;
				1525	int err = 0;
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1526
				1527	if (ceph_snap(inode) != CEPH_NOSNAP)
				1528	return -EROFS;
				1529
				1530	__ceph_do_pending_vmtruncate(inode);
				1531
				1532	err = inode_change_ok(inode, attr);
				1533	if (err != 0)
				1534	return err;
				1535
				1536	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
				1537	USE_AUTH_MDS);
				1538	if (IS_ERR(req))
				1539	return PTR_ERR(req);
				1540
				1541	spin_lock(&inode->i_lock);
				1542	issued = __ceph_caps_issued(ci, NULL);
				1543	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
				1544
				1545	if (ia_valid & ATTR_UID) {
				1546	dout("setattr %p uid %d -> %d\n", inode,
				1547	inode->i_uid, attr->ia_uid);
				1548	if (issued & CEPH_CAP_AUTH_EXCL) {
				1549	inode->i_uid = attr->ia_uid;
				1550	dirtied \|= CEPH_CAP_AUTH_EXCL;
				1551	} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 \|\|
				1552	attr->ia_uid != inode->i_uid) {
				1553	req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
				1554	mask \|= CEPH_SETATTR_UID;
				1555	release \|= CEPH_CAP_AUTH_SHARED;
				1556	}
				1557	}
				1558	if (ia_valid & ATTR_GID) {
				1559	dout("setattr %p gid %d -> %d\n", inode,
				1560	inode->i_gid, attr->ia_gid);
				1561	if (issued & CEPH_CAP_AUTH_EXCL) {
				1562	inode->i_gid = attr->ia_gid;
				1563	dirtied \|= CEPH_CAP_AUTH_EXCL;
				1564	} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 \|\|
				1565	attr->ia_gid != inode->i_gid) {
				1566	req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
				1567	mask \|= CEPH_SETATTR_GID;
				1568	release \|= CEPH_CAP_AUTH_SHARED;
				1569	}
				1570	}
				1571	if (ia_valid & ATTR_MODE) {
				1572	dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
				1573	attr->ia_mode);
				1574	if (issued & CEPH_CAP_AUTH_EXCL) {
				1575	inode->i_mode = attr->ia_mode;
				1576	dirtied \|= CEPH_CAP_AUTH_EXCL;
				1577	} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 \|\|
				1578	attr->ia_mode != inode->i_mode) {
				1579	req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
				1580	mask \|= CEPH_SETATTR_MODE;
				1581	release \|= CEPH_CAP_AUTH_SHARED;
				1582	}
				1583	}
				1584
				1585	if (ia_valid & ATTR_ATIME) {
				1586	dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
				1587	inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
				1588	attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
				1589	if (issued & CEPH_CAP_FILE_EXCL) {
				1590	ci->i_time_warp_seq++;
				1591	inode->i_atime = attr->ia_atime;
				1592	dirtied \|= CEPH_CAP_FILE_EXCL;
				1593	} else if ((issued & CEPH_CAP_FILE_WR) &&
				1594	timespec_compare(&inode->i_atime,
				1595	&attr->ia_atime) < 0) {
				1596	inode->i_atime = attr->ia_atime;
				1597	dirtied \|= CEPH_CAP_FILE_WR;
				1598	} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 \|\|
				1599	!timespec_equal(&inode->i_atime, &attr->ia_atime)) {
				1600	ceph_encode_timespec(&req->r_args.setattr.atime,
				1601	&attr->ia_atime);
				1602	mask \|= CEPH_SETATTR_ATIME;
				1603	release \|= CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_RD \|
				1604	CEPH_CAP_FILE_WR;
				1605	}
				1606	}
				1607	if (ia_valid & ATTR_MTIME) {
				1608	dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
				1609	inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
				1610	attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
				1611	if (issued & CEPH_CAP_FILE_EXCL) {
				1612	ci->i_time_warp_seq++;
				1613	inode->i_mtime = attr->ia_mtime;
				1614	dirtied \|= CEPH_CAP_FILE_EXCL;
				1615	} else if ((issued & CEPH_CAP_FILE_WR) &&
				1616	timespec_compare(&inode->i_mtime,
				1617	&attr->ia_mtime) < 0) {
				1618	inode->i_mtime = attr->ia_mtime;
				1619	dirtied \|= CEPH_CAP_FILE_WR;
				1620	} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 \|\|
				1621	!timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
				1622	ceph_encode_timespec(&req->r_args.setattr.mtime,
				1623	&attr->ia_mtime);
				1624	mask \|= CEPH_SETATTR_MTIME;
				1625	release \|= CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_RD \|
				1626	CEPH_CAP_FILE_WR;
				1627	}
				1628	}
				1629	if (ia_valid & ATTR_SIZE) {
				1630	dout("setattr %p size %lld -> %lld\n", inode,
				1631	inode->i_size, attr->ia_size);
				1632	if (attr->ia_size > inode->i_sb->s_maxbytes) {
				1633	err = -EINVAL;
				1634	goto out;
				1635	}
				1636	if ((issued & CEPH_CAP_FILE_EXCL) &&
				1637	attr->ia_size > inode->i_size) {
				1638	inode->i_size = attr->ia_size;
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1639	inode->i_blocks =
				1640	(attr->ia_size + (1 << 9) - 1) >> 9;
				1641	inode->i_ctime = attr->ia_ctime;
				1642	ci->i_reported_size = attr->ia_size;
				1643	dirtied \|= CEPH_CAP_FILE_EXCL;
				1644	} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 \|\|
				1645	attr->ia_size != inode->i_size) {
				1646	req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
				1647	req->r_args.setattr.old_size =
				1648	cpu_to_le64(inode->i_size);
				1649	mask \|= CEPH_SETATTR_SIZE;
				1650	release \|= CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_RD \|
				1651	CEPH_CAP_FILE_WR;
				1652	}
				1653	}
				1654
				1655	/* these do nothing */
				1656	if (ia_valid & ATTR_CTIME) {
				1657	bool only = (ia_valid & (ATTR_SIZE\|ATTR_MTIME\|ATTR_ATIME\|
				1658	ATTR_MODE\|ATTR_UID\|ATTR_GID)) == 0;
				1659	dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
				1660	inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
				1661	attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
				1662	only ? "ctime only" : "ignored");
				1663	inode->i_ctime = attr->ia_ctime;
				1664	if (only) {
				1665	/*
				1666	* if kernel wants to dirty ctime but nothing else,
				1667	* we need to choose a cap to dirty under, or do
				1668	* a almost-no-op setattr
				1669	*/
				1670	if (issued & CEPH_CAP_AUTH_EXCL)
				1671	dirtied \|= CEPH_CAP_AUTH_EXCL;
				1672	else if (issued & CEPH_CAP_FILE_EXCL)
				1673	dirtied \|= CEPH_CAP_FILE_EXCL;
				1674	else if (issued & CEPH_CAP_XATTR_EXCL)
				1675	dirtied \|= CEPH_CAP_XATTR_EXCL;
				1676	else
				1677	mask \|= CEPH_SETATTR_CTIME;
				1678	}
				1679	}
				1680	if (ia_valid & ATTR_FILE)
				1681	dout("setattr %p ATTR_FILE ... hrm!\n", inode);
				1682
				1683	if (dirtied) {
				1684	__ceph_mark_dirty_caps(ci, dirtied);
				1685	inode->i_ctime = CURRENT_TIME;
				1686	}
				1687
				1688	release &= issued;
				1689	spin_unlock(&inode->i_lock);
				1690
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1691	if (mask) {
				1692	req->r_inode = igrab(inode);
				1693	req->r_inode_drop = release;
				1694	req->r_args.setattr.mask = cpu_to_le32(mask);
				1695	req->r_num_caps = 1;
				1696	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
				1697	}
				1698	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
				1699	ceph_cap_string(dirtied), mask);
				1700
				1701	ceph_mdsc_put_request(req);
				1702	__ceph_do_pending_vmtruncate(inode);
				1703	return err;
				1704	out:
				1705	spin_unlock(&inode->i_lock);
				1706	ceph_mdsc_put_request(req);
				1707	return err;
				1708	}
				1709
				1710	/*
				1711	* Verify that we have a lease on the given mask. If not,
				1712	* do a getattr against an mds.
				1713	*/
				1714	int ceph_do_getattr(struct inode *inode, int mask)
				1715	{
				1716	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
				1717	struct ceph_mds_client *mdsc = &client->mdsc;
				1718	struct ceph_mds_request *req;
				1719	int err;
				1720
				1721	if (ceph_snap(inode) == CEPH_SNAPDIR) {
				1722	dout("do_getattr inode %p SNAPDIR\n", inode);
				1723	return 0;
				1724	}
				1725
				1726	dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
				1727	if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
				1728	return 0;
				1729
				1730	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
				1731	if (IS_ERR(req))
				1732	return PTR_ERR(req);
				1733	req->r_inode = igrab(inode);
				1734	req->r_num_caps = 1;
				1735	req->r_args.getattr.mask = cpu_to_le32(mask);
				1736	err = ceph_mdsc_do_request(mdsc, NULL, req);
				1737	ceph_mdsc_put_request(req);
				1738	dout("do_getattr result=%d\n", err);
				1739	return err;
				1740	}
				1741
				1742
				1743	/*
				1744	* Check inode permissions. We verify we have a valid value for
				1745	* the AUTH cap, then call the generic handler.
				1746	*/
				1747	int ceph_permission(struct inode *inode, int mask)
				1748	{
				1749	int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
				1750
				1751	if (!err)
				1752	err = generic_permission(inode, mask, NULL);
				1753	return err;
				1754	}
				1755
				1756	/*
				1757	* Get all attributes. Hopefully somedata we'll have a statlite()
				1758	* and can limit the fields we require to be accurate.
				1759	*/
				1760	int ceph_getattr(struct vfsmount mnt, struct dentry dentry,
				1761	struct kstat *stat)
				1762	{
				1763	struct inode *inode = dentry->d_inode;
Sage Weil	232d4b0	2009-10-21 11:21:49 -0700	[diff] [blame]	1764	struct ceph_inode_info *ci = ceph_inode(inode);
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1765	int err;
				1766
				1767	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
				1768	if (!err) {
				1769	generic_fillattr(inode, stat);
				1770	stat->ino = inode->i_ino;
				1771	if (ceph_snap(inode) != CEPH_NOSNAP)
				1772	stat->dev = ceph_snap(inode);
				1773	else
				1774	stat->dev = 0;
Sage Weil	232d4b0	2009-10-21 11:21:49 -0700	[diff] [blame]	1775	if (S_ISDIR(inode->i_mode)) {
				1776	stat->size = ci->i_rbytes;
				1777	stat->blocks = 0;
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1778	stat->blksize = 65536;
Sage Weil	232d4b0	2009-10-21 11:21:49 -0700	[diff] [blame]	1779	}
Sage Weil	355da1e	2009-10-06 11:31:08 -0700	[diff] [blame]	1780	}
				1781	return err;
				1782	}