Blame - fs/ocfs2/extent_map.c - kernel/msm

blob: fcd4475d1f899754bb5ed61fa069b149a5e48c20 [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -- mode: c; c-basic-offset: 8; --
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* extent_map.c
				5	*
				6	* In-memory extent map for OCFS2. Man, this code was prettier in
				7	* the library.
				8	*
				9	* Copyright (C) 2004 Oracle. All rights reserved.
				10	*
				11	* This program is free software; you can redistribute it and/or
				12	* modify it under the terms of the GNU General Public
				13	* License, version 2, as published by the Free Software Foundation.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 021110-1307, USA.
				24	*/
				25
				26	#include <linux/fs.h>
				27	#include <linux/init.h>
				28	#include <linux/types.h>
				29	#include <linux/slab.h>
				30	#include <linux/rbtree.h>
				31
				32	#define MLOG_MASK_PREFIX ML_EXTENT_MAP
				33	#include <cluster/masklog.h>
				34
				35	#include "ocfs2.h"
				36
				37	#include "extent_map.h"
				38	#include "inode.h"
				39	#include "super.h"
				40
				41	#include "buffer_head_io.h"
				42
				43
				44	/*
				45	* SUCK SUCK SUCK
				46	* Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
				47	*/
				48
				49	struct ocfs2_extent_map_entry {
				50	struct rb_node e_node;
				51	int e_tree_depth;
				52	struct ocfs2_extent_rec e_rec;
				53	};
				54
				55	struct ocfs2_em_insert_context {
				56	int need_left;
				57	int need_right;
				58	struct ocfs2_extent_map_entry *new_ent;
				59	struct ocfs2_extent_map_entry *old_ent;
				60	struct ocfs2_extent_map_entry *left_ent;
				61	struct ocfs2_extent_map_entry *right_ent;
				62	};
				63
				64	static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
				65
				66
				67	static struct ocfs2_extent_map_entry *
				68	ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
				69	u32 cpos, u32 clusters,
				70	struct rb_node ***ret_p,
				71	struct rb_node **ret_parent);
				72	static int ocfs2_extent_map_insert(struct inode *inode,
				73	struct ocfs2_extent_rec *rec,
				74	int tree_depth);
				75	static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
				76	struct ocfs2_extent_map_entry *ent);
				77	static int ocfs2_extent_map_find_leaf(struct inode *inode,
				78	u32 cpos, u32 clusters,
				79	struct ocfs2_extent_list *el);
				80	static int ocfs2_extent_map_lookup_read(struct inode *inode,
				81	u32 cpos, u32 clusters,
				82	struct ocfs2_extent_map_entry **ret_ent);
				83	static int ocfs2_extent_map_try_insert(struct inode *inode,
				84	struct ocfs2_extent_rec *rec,
				85	int tree_depth,
				86	struct ocfs2_em_insert_context *ctxt);
				87
				88	/* returns 1 only if the rec contains all the given clusters -- that is that
				89	* rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
				90	* clusters) is >= the argument's endpoint */
				91	static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
				92	u32 cpos, u32 clusters)
				93	{
				94	if (le32_to_cpu(rec->e_cpos) > cpos)
				95	return 0;
				96	if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
				97	le32_to_cpu(rec->e_clusters))
				98	return 0;
				99	return 1;
				100	}
				101
				102
				103	/*
				104	* Find an entry in the tree that intersects the region passed in.
				105	* Note that this will find straddled intervals, it is up to the
				106	* callers to enforce any boundary conditions.
				107	*
				108	* Callers must hold ip_lock. This lookup is not guaranteed to return
				109	* a tree_depth 0 match, and as such can race inserts if the lock
				110	* were not held.
				111	*
				112	* The rb_node garbage lets insertion share the search. Trivial
				113	* callers pass NULL.
				114	*/
				115	static struct ocfs2_extent_map_entry *
				116	ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
				117	u32 cpos, u32 clusters,
				118	struct rb_node ***ret_p,
				119	struct rb_node **ret_parent)
				120	{
				121	struct rb_node **p = &em->em_extents.rb_node;
				122	struct rb_node *parent = NULL;
				123	struct ocfs2_extent_map_entry *ent = NULL;
				124
				125	while (*p)
				126	{
				127	parent = *p;
				128	ent = rb_entry(parent, struct ocfs2_extent_map_entry,
				129	e_node);
				130	if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
				131	p = &(*p)->rb_left;
				132	ent = NULL;
				133	} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
				134	le32_to_cpu(ent->e_rec.e_clusters))) {
				135	p = &(*p)->rb_right;
				136	ent = NULL;
				137	} else
				138	break;
				139	}
				140
				141	if (ret_p != NULL)
				142	*ret_p = p;
				143	if (ret_parent != NULL)
				144	*ret_parent = parent;
				145	return ent;
				146	}
				147
				148	/*
				149	* Find the leaf containing the interval we want. While we're on our
				150	* way down the tree, fill in every record we see at any depth, because
				151	* we might want it later.
				152	*
				153	* Note that this code is run without ip_lock. That's because it
				154	* sleeps while reading. If someone is also filling the extent list at
				155	* the same time we are, we might have to restart.
				156	*/
				157	static int ocfs2_extent_map_find_leaf(struct inode *inode,
				158	u32 cpos, u32 clusters,
				159	struct ocfs2_extent_list *el)
				160	{
				161	int i, ret;
				162	struct buffer_head *eb_bh = NULL;
				163	u64 blkno;
				164	u32 rec_end;
				165	struct ocfs2_extent_block *eb;
				166	struct ocfs2_extent_rec *rec;
				167
				168	/*
				169	* The bh data containing the el cannot change here, because
				170	* we hold alloc_sem. So we can do this without other
				171	* locks.
				172	*/
				173	while (el->l_tree_depth)
				174	{
				175	blkno = 0;
				176	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
				177	rec = &el->l_recs[i];
				178	rec_end = (le32_to_cpu(rec->e_cpos) +
				179	le32_to_cpu(rec->e_clusters));
				180
				181	ret = -EBADR;
				182	if (rec_end > OCFS2_I(inode)->ip_clusters) {
				183	mlog_errno(ret);
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	184	ocfs2_error(inode->i_sb,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	185	"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	186	i,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	187	(unsigned long long)le64_to_cpu(rec->e_blkno),
				188	(unsigned long long)OCFS2_I(inode)->ip_blkno,
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	189	OCFS2_I(inode)->ip_clusters);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	190	goto out_free;
				191	}
				192
				193	if (rec_end <= cpos) {
				194	ret = ocfs2_extent_map_insert(inode, rec,
				195	le16_to_cpu(el->l_tree_depth));
				196	if (ret && (ret != -EEXIST)) {
				197	mlog_errno(ret);
				198	goto out_free;
				199	}
				200	continue;
				201	}
				202	if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
				203	ret = ocfs2_extent_map_insert(inode, rec,
				204	le16_to_cpu(el->l_tree_depth));
				205	if (ret && (ret != -EEXIST)) {
				206	mlog_errno(ret);
				207	goto out_free;
				208	}
				209	continue;
				210	}
				211
				212	/*
				213	* We've found a record that matches our
				214	* interval. We don't insert it because we're
				215	* about to traverse it.
				216	*/
				217
				218	/* Check to see if we're stradling */
				219	ret = -ESRCH;
				220	if (!ocfs2_extent_rec_contains_clusters(rec,
				221	cpos,
				222	clusters)) {
				223	mlog_errno(ret);
				224	goto out_free;
				225	}
				226
				227	/*
				228	* If we've already found a record, the el has
				229	* two records covering the same interval.
				230	* EEEK!
				231	*/
				232	ret = -EBADR;
				233	if (blkno) {
				234	mlog_errno(ret);
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	235	ocfs2_error(inode->i_sb,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	236	"Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	237	cpos, clusters,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	238	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				239	(unsigned long long)blkno, i,
				240	(unsigned long long)le64_to_cpu(rec->e_blkno));
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	241	goto out_free;
				242	}
				243
				244	blkno = le64_to_cpu(rec->e_blkno);
				245	}
				246
				247	/*
				248	* We don't support holes, and we're still up
				249	* in the branches, so we'd better have found someone
				250	*/
				251	ret = -EBADR;
				252	if (!blkno) {
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	253	ocfs2_error(inode->i_sb,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	254	"No record found for (cpos = %u, clusters = %u) on inode %llu\n",
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	255	cpos, clusters,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	256	(unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	257	mlog_errno(ret);
				258	goto out_free;
				259	}
				260
				261	if (eb_bh) {
				262	brelse(eb_bh);
				263	eb_bh = NULL;
				264	}
				265	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				266	blkno, &eb_bh, OCFS2_BH_CACHED,
				267	inode);
				268	if (ret) {
				269	mlog_errno(ret);
				270	goto out_free;
				271	}
				272	eb = (struct ocfs2_extent_block *)eb_bh->b_data;
				273	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
				274	OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
				275	ret = -EIO;
				276	goto out_free;
				277	}
				278	el = &eb->h_list;
				279	}
				280
Eric Sesterhenn / snakebyte	ebdec83	2006-01-27 10:32:52 +0100	[diff] [blame]	281	BUG_ON(el->l_tree_depth);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	282
				283	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
				284	rec = &el->l_recs[i];
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	285
				286	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
				287	OCFS2_I(inode)->ip_clusters) {
				288	ret = -EBADR;
				289	mlog_errno(ret);
				290	ocfs2_error(inode->i_sb,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	291	"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	292	i,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	293	(unsigned long long)le64_to_cpu(rec->e_blkno),
				294	(unsigned long long)OCFS2_I(inode)->ip_blkno,
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	295	OCFS2_I(inode)->ip_clusters);
				296	return ret;
				297	}
				298
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	299	ret = ocfs2_extent_map_insert(inode, rec,
				300	le16_to_cpu(el->l_tree_depth));
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	301	if (ret && (ret != -EEXIST)) {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	302	mlog_errno(ret);
				303	goto out_free;
				304	}
				305	}
				306
				307	ret = 0;
				308
				309	out_free:
				310	if (eb_bh)
				311	brelse(eb_bh);
				312
				313	return ret;
				314	}
				315
				316	/*
				317	* This lookup actually will read from disk. It has one invariant:
				318	* It will never re-traverse blocks. This means that all inserts should
				319	* be new regions or more granular regions (both allowed by insert).
				320	*/
				321	static int ocfs2_extent_map_lookup_read(struct inode *inode,
				322	u32 cpos,
				323	u32 clusters,
				324	struct ocfs2_extent_map_entry **ret_ent)
				325	{
				326	int ret;
				327	u64 blkno;
				328	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				329	struct ocfs2_extent_map_entry *ent;
				330	struct buffer_head *bh = NULL;
				331	struct ocfs2_extent_block *eb;
				332	struct ocfs2_dinode *di;
				333	struct ocfs2_extent_list *el;
				334
				335	spin_lock(&OCFS2_I(inode)->ip_lock);
				336	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
				337	if (ent) {
				338	if (!ent->e_tree_depth) {
				339	spin_unlock(&OCFS2_I(inode)->ip_lock);
				340	*ret_ent = ent;
				341	return 0;
				342	}
				343	blkno = le64_to_cpu(ent->e_rec.e_blkno);
				344	spin_unlock(&OCFS2_I(inode)->ip_lock);
				345
				346	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
				347	OCFS2_BH_CACHED, inode);
				348	if (ret) {
				349	mlog_errno(ret);
				350	if (bh)
				351	brelse(bh);
				352	return ret;
				353	}
				354	eb = (struct ocfs2_extent_block *)bh->b_data;
				355	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
				356	OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
				357	brelse(bh);
				358	return -EIO;
				359	}
				360	el = &eb->h_list;
				361	} else {
				362	spin_unlock(&OCFS2_I(inode)->ip_lock);
				363
				364	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				365	OCFS2_I(inode)->ip_blkno, &bh,
				366	OCFS2_BH_CACHED, inode);
				367	if (ret) {
				368	mlog_errno(ret);
				369	if (bh)
				370	brelse(bh);
				371	return ret;
				372	}
				373	di = (struct ocfs2_dinode *)bh->b_data;
				374	if (!OCFS2_IS_VALID_DINODE(di)) {
				375	brelse(bh);
				376	OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
				377	return -EIO;
				378	}
				379	el = &di->id2.i_list;
				380	}
				381
				382	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
				383	brelse(bh);
				384	if (ret) {
				385	mlog_errno(ret);
				386	return ret;
				387	}
				388
				389	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
				390	if (!ent) {
				391	ret = -ESRCH;
				392	mlog_errno(ret);
				393	return ret;
				394	}
				395
Eric Sesterhenn / snakebyte	ebdec83	2006-01-27 10:32:52 +0100	[diff] [blame]	396	/* FIXME: Make sure this isn't a corruption */
				397	BUG_ON(ent->e_tree_depth);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	398
				399	*ret_ent = ent;
				400
				401	return 0;
				402	}
				403
				404	/*
				405	* Callers must hold ip_lock. This can insert pieces of the tree,
				406	* thus racing lookup if the lock weren't held.
				407	*/
				408	static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
				409	struct ocfs2_extent_map_entry *ent)
				410	{
				411	struct rb_node *p, parent;
				412	struct ocfs2_extent_map_entry *old_ent;
				413
				414	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
				415	le32_to_cpu(ent->e_rec.e_clusters),
				416	&p, &parent);
				417	if (old_ent)
				418	return -EEXIST;
				419
				420	rb_link_node(&ent->e_node, parent, p);
				421	rb_insert_color(&ent->e_node, &em->em_extents);
				422
				423	return 0;
				424	}
				425
				426
				427	/*
				428	* Simple rule: on any return code other than -EAGAIN, anything left
				429	* in the insert_context will be freed.
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	430	*
				431	* Simple rule #2: A return code of -EEXIST from this function or
				432	* its calls to ocfs2_extent_map_insert_entry() signifies that another
				433	* thread beat us to the insert. It is not an actual error, but it
				434	* tells the caller we have no more work to do.
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	435	*/
				436	static int ocfs2_extent_map_try_insert(struct inode *inode,
				437	struct ocfs2_extent_rec *rec,
				438	int tree_depth,
				439	struct ocfs2_em_insert_context *ctxt)
				440	{
				441	int ret;
				442	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				443	struct ocfs2_extent_map_entry *old_ent;
				444
				445	ctxt->need_left = 0;
				446	ctxt->need_right = 0;
				447	ctxt->old_ent = NULL;
				448
				449	spin_lock(&OCFS2_I(inode)->ip_lock);
				450	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
				451	if (!ret) {
				452	ctxt->new_ent = NULL;
				453	goto out_unlock;
				454	}
				455
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	456	/* Since insert_entry failed, the map MUST have old_ent */
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	457	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	458	le32_to_cpu(rec->e_clusters),
				459	NULL, NULL);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	460
Eric Sesterhenn / snakebyte	ebdec83	2006-01-27 10:32:52 +0100	[diff] [blame]	461	BUG_ON(!old_ent);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	462
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	463	if (old_ent->e_tree_depth < tree_depth) {
				464	/* Another thread beat us to the lower tree_depth */
				465	ret = -EEXIST;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	466	goto out_unlock;
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	467	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	468
				469	if (old_ent->e_tree_depth == tree_depth) {
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	470	/*
				471	* Another thread beat us to this tree_depth.
				472	* Let's make sure we agree with that thread (the
				473	* extent_rec should be identical).
				474	*/
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	475	if (!memcmp(rec, &old_ent->e_rec,
				476	sizeof(struct ocfs2_extent_rec)))
				477	ret = 0;
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	478	else
				479	/* FIXME: Should this be ESRCH/EBADR??? */
				480	ret = -EEXIST;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	481
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	482	goto out_unlock;
				483	}
				484
				485	/*
				486	* We do it in this order specifically so that no actual tree
				487	* changes occur until we have all the pieces we need. We
				488	* don't want malloc failures to leave an inconsistent tree.
				489	* Whenever we drop the lock, another process could be
				490	* inserting. Also note that, if another process just beat us
				491	* to an insert, we might not need the same pieces we needed
				492	* the first go round. In the end, the pieces we need will
				493	* be used, and the pieces we don't will be freed.
				494	*/
				495	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
				496	le32_to_cpu(old_ent->e_rec.e_cpos));
				497	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
				498	le32_to_cpu(old_ent->e_rec.e_clusters)) >
				499	(le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
				500	ret = -EAGAIN;
				501	if (ctxt->need_left) {
				502	if (!ctxt->left_ent)
				503	goto out_unlock;
				504	(ctxt->left_ent) = old_ent;
				505	ctxt->left_ent->e_rec.e_clusters =
				506	cpu_to_le32(le32_to_cpu(rec->e_cpos) -
				507	le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
				508	}
				509	if (ctxt->need_right) {
				510	if (!ctxt->right_ent)
				511	goto out_unlock;
				512	(ctxt->right_ent) = old_ent;
				513	ctxt->right_ent->e_rec.e_cpos =
				514	cpu_to_le32(le32_to_cpu(rec->e_cpos) +
				515	le32_to_cpu(rec->e_clusters));
				516	ctxt->right_ent->e_rec.e_clusters =
				517	cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
				518	le32_to_cpu(old_ent->e_rec.e_clusters)) -
				519	le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
				520	}
				521
				522	rb_erase(&old_ent->e_node, &em->em_extents);
				523	/* Now that he's erased, set him up for deletion */
				524	ctxt->old_ent = old_ent;
				525
				526	if (ctxt->need_left) {
				527	ret = ocfs2_extent_map_insert_entry(em,
				528	ctxt->left_ent);
				529	if (ret)
				530	goto out_unlock;
				531	ctxt->left_ent = NULL;
				532	}
				533
				534	if (ctxt->need_right) {
				535	ret = ocfs2_extent_map_insert_entry(em,
				536	ctxt->right_ent);
				537	if (ret)
				538	goto out_unlock;
				539	ctxt->right_ent = NULL;
				540	}
				541
				542	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
				543
				544	if (!ret)
				545	ctxt->new_ent = NULL;
				546
				547	out_unlock:
				548	spin_unlock(&OCFS2_I(inode)->ip_lock);
				549
				550	return ret;
				551	}
				552
				553
				554	static int ocfs2_extent_map_insert(struct inode *inode,
				555	struct ocfs2_extent_rec *rec,
				556	int tree_depth)
				557	{
				558	int ret;
				559	struct ocfs2_em_insert_context ctxt = {0, };
				560
				561	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
				562	OCFS2_I(inode)->ip_map.em_clusters) {
				563	ret = -EBADR;
				564	mlog_errno(ret);
				565	return ret;
				566	}
				567
				568	/* Zero e_clusters means a truncated tail record. It better be EOF */
				569	if (!rec->e_clusters) {
				570	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
				571	OCFS2_I(inode)->ip_map.em_clusters) {
				572	ret = -EBADR;
				573	mlog_errno(ret);
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	574	ocfs2_error(inode->i_sb,
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	575	"Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
				576	(unsigned long long)le64_to_cpu(rec->e_blkno),
				577	(unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	578	return ret;
				579	}
				580
				581	/* Ignore the truncated tail */
				582	return 0;
				583	}
				584
				585	ret = -ENOMEM;
				586	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
Sunil Mushran	afae00ab	2006-04-12 14:37:00 -0700	[diff] [blame]	587	GFP_NOFS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	588	if (!ctxt.new_ent) {
				589	mlog_errno(ret);
				590	return ret;
				591	}
				592
				593	ctxt.new_ent->e_rec = *rec;
				594	ctxt.new_ent->e_tree_depth = tree_depth;
				595
				596	do {
				597	ret = -ENOMEM;
				598	if (ctxt.need_left && !ctxt.left_ent) {
				599	ctxt.left_ent =
				600	kmem_cache_alloc(ocfs2_em_ent_cachep,
Sunil Mushran	afae00ab	2006-04-12 14:37:00 -0700	[diff] [blame]	601	GFP_NOFS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	602	if (!ctxt.left_ent)
				603	break;
				604	}
				605	if (ctxt.need_right && !ctxt.right_ent) {
				606	ctxt.right_ent =
				607	kmem_cache_alloc(ocfs2_em_ent_cachep,
Sunil Mushran	afae00ab	2006-04-12 14:37:00 -0700	[diff] [blame]	608	GFP_NOFS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	609	if (!ctxt.right_ent)
				610	break;
				611	}
				612
				613	ret = ocfs2_extent_map_try_insert(inode, rec,
				614	tree_depth, &ctxt);
				615	} while (ret == -EAGAIN);
				616
Joel Becker	a43db30	2006-04-27 16:36:14 -0700	[diff] [blame]	617	if ((ret < 0) && (ret != -EEXIST))
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	618	mlog_errno(ret);
				619
				620	if (ctxt.left_ent)
				621	kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
				622	if (ctxt.right_ent)
				623	kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
				624	if (ctxt.old_ent)
				625	kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
				626	if (ctxt.new_ent)
				627	kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
				628
				629	return ret;
				630	}
				631
				632	/*
				633	* Append this record to the tail of the extent map. It must be
				634	* tree_depth 0. The record might be an extension of an existing
				635	* record, and as such that needs to be handled. eg:
				636	*
				637	* Existing record in the extent map:
				638	*
				639	* cpos = 10, len = 10
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	640	* \|---------\|
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	641	*
				642	* New Record:
				643	*
				644	* cpos = 10, len = 20
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	645	* \|------------------\|
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	646	*
				647	* The passed record is the new on-disk record. The new_clusters value
				648	* is how many clusters were added to the file. If the append is a
				649	* contiguous append, the new_clusters has been added to
				650	* rec->e_clusters. If the append is an entirely new extent, then
				651	* rec->e_clusters is == new_clusters.
				652	*/
				653	int ocfs2_extent_map_append(struct inode *inode,
				654	struct ocfs2_extent_rec *rec,
				655	u32 new_clusters)
				656	{
				657	int ret;
				658	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				659	struct ocfs2_extent_map_entry *ent;
				660	struct ocfs2_extent_rec *old;
				661
				662	BUG_ON(!new_clusters);
				663	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
				664
				665	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
				666	/*
				667	* Size changed underneath us on disk. Drop any
				668	* straddling records and update our idea of
				669	* i_clusters
				670	*/
				671	ocfs2_extent_map_drop(inode, em->em_clusters - 1);
				672	em->em_clusters = OCFS2_I(inode)->ip_clusters;
				673	}
				674
				675	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
				676	le32_to_cpu(rec->e_clusters)) !=
				677	(em->em_clusters + new_clusters),
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	678	"Inode %llu:\n"
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	679	"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
				680	"em->em_clusters = %u + new_clusters = %u = %u\n",
Mark Fasheh	b069705	2006-03-03 10:24:33 -0800	[diff] [blame]	681	(unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	682	le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
				683	le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
				684	em->em_clusters, new_clusters,
				685	em->em_clusters + new_clusters);
				686
				687	em->em_clusters += new_clusters;
				688
				689	ret = -ENOENT;
				690	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
				691	/* This is a contiguous append */
				692	ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
				693	NULL, NULL);
				694	if (ent) {
				695	old = &ent->e_rec;
				696	BUG_ON((le32_to_cpu(rec->e_cpos) +
				697	le32_to_cpu(rec->e_clusters)) !=
				698	(le32_to_cpu(old->e_cpos) +
				699	le32_to_cpu(old->e_clusters) +
				700	new_clusters));
				701	if (ent->e_tree_depth == 0) {
				702	BUG_ON(le32_to_cpu(old->e_cpos) !=
				703	le32_to_cpu(rec->e_cpos));
				704	BUG_ON(le64_to_cpu(old->e_blkno) !=
				705	le64_to_cpu(rec->e_blkno));
				706	ret = 0;
				707	}
				708	/*
				709	* Let non-leafs fall through as -ENOENT to
				710	* force insertion of the new leaf.
				711	*/
				712	le32_add_cpu(&old->e_clusters, new_clusters);
				713	}
				714	}
				715
				716	if (ret == -ENOENT)
				717	ret = ocfs2_extent_map_insert(inode, rec, 0);
				718	if (ret < 0)
				719	mlog_errno(ret);
				720	return ret;
				721	}
				722
				723	#if 0
				724	/* Code here is included but defined out as it completes the extent
				725	* map api and may be used in the future. */
				726
				727	/*
				728	* Look up the record containing this cluster offset. This record is
				729	* part of the extent map. Do not free it. Any changes you make to
				730	* it will reflect in the extent map. So, if your last extent
				731	* is (cpos = 10, clusters = 10) and you truncate the file by 5
				732	* clusters, you can do:
				733	*
				734	* ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
				735	* rec->e_clusters -= 5;
				736	*
				737	* The lookup does not read from disk. If the map isn't filled in for
				738	* an entry, you won't find it.
				739	*
				740	* Also note that the returned record is valid until alloc_sem is
				741	* dropped. After that, truncate and extend can happen. Caveat Emptor.
				742	*/
				743	int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
				744	struct ocfs2_extent_rec **rec,
				745	int *tree_depth)
				746	{
				747	int ret = -ENOENT;
				748	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				749	struct ocfs2_extent_map_entry *ent;
				750
				751	*rec = NULL;
				752
				753	if (cpos >= OCFS2_I(inode)->ip_clusters)
				754	return -EINVAL;
				755
				756	if (cpos >= em->em_clusters) {
				757	/*
				758	* Size changed underneath us on disk. Drop any
				759	* straddling records and update our idea of
				760	* i_clusters
				761	*/
				762	ocfs2_extent_map_drop(inode, em->em_clusters - 1);
				763	em->em_clusters = OCFS2_I(inode)->ip_clusters ;
				764	}
				765
				766	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
				767	NULL, NULL);
				768
				769	if (ent) {
				770	*rec = &ent->e_rec;
				771	if (tree_depth)
				772	*tree_depth = ent->e_tree_depth;
				773	ret = 0;
				774	}
				775
				776	return ret;
				777	}
				778
				779	int ocfs2_extent_map_get_clusters(struct inode *inode,
				780	u32 v_cpos, int count,
				781	u32 p_cpos, int ret_count)
				782	{
				783	int ret;
				784	u32 coff, ccount;
				785	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				786	struct ocfs2_extent_map_entry *ent = NULL;
				787
				788	*p_cpos = ccount = 0;
				789
				790	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
				791	return -EINVAL;
				792
				793	if ((v_cpos + count) > em->em_clusters) {
				794	/*
				795	* Size changed underneath us on disk. Drop any
				796	* straddling records and update our idea of
				797	* i_clusters
				798	*/
				799	ocfs2_extent_map_drop(inode, em->em_clusters - 1);
				800	em->em_clusters = OCFS2_I(inode)->ip_clusters;
				801	}
				802
				803
				804	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
				805	if (ret)
				806	return ret;
				807
				808	if (ent) {
				809	/* We should never find ourselves straddling an interval */
				810	if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
				811	v_cpos,
				812	count))
				813	return -ESRCH;
				814
				815	coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
				816	*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
				817	le64_to_cpu(ent->e_rec.e_blkno)) +
				818	coff;
				819
				820	if (ret_count)
				821	*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
				822
				823	return 0;
				824	}
				825
				826
				827	return -ENOENT;
				828	}
				829
				830	#endif /* 0 */
				831
				832	int ocfs2_extent_map_get_blocks(struct inode *inode,
				833	u64 v_blkno, int count,
				834	u64 p_blkno, int ret_count)
				835	{
				836	int ret;
				837	u64 boff;
				838	u32 cpos, clusters;
				839	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
				840	struct ocfs2_extent_map_entry *ent = NULL;
				841	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				842	struct ocfs2_extent_rec *rec;
				843
				844	*p_blkno = 0;
				845
				846	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
				847	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
				848	(u64)count + bpc - 1);
				849	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
				850	ret = -EINVAL;
				851	mlog_errno(ret);
				852	return ret;
				853	}
				854
				855	if ((cpos + clusters) > em->em_clusters) {
				856	/*
				857	* Size changed underneath us on disk. Drop any
				858	* straddling records and update our idea of
				859	* i_clusters
				860	*/
				861	ocfs2_extent_map_drop(inode, em->em_clusters - 1);
				862	em->em_clusters = OCFS2_I(inode)->ip_clusters;
				863	}
				864
				865	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
				866	if (ret) {
				867	mlog_errno(ret);
				868	return ret;
				869	}
				870
				871	if (ent)
				872	{
				873	rec = &ent->e_rec;
				874
				875	/* We should never find ourselves straddling an interval */
				876	if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
				877	ret = -ESRCH;
				878	mlog_errno(ret);
				879	return ret;
				880	}
				881
				882	boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
				883	le32_to_cpu(rec->e_cpos));
				884	boff += (v_blkno & (u64)(bpc - 1));
				885	*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
				886
				887	if (ret_count) {
				888	*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
				889	le32_to_cpu(rec->e_clusters)) - boff;
				890	}
				891
				892	return 0;
				893	}
				894
				895	return -ENOENT;
				896	}
				897
				898	int ocfs2_extent_map_init(struct inode *inode)
				899	{
				900	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				901
				902	em->em_extents = RB_ROOT;
				903	em->em_clusters = 0;
				904
				905	return 0;
				906	}
				907
				908	/* Needs the lock */
				909	static void __ocfs2_extent_map_drop(struct inode *inode,
				910	u32 new_clusters,
				911	struct rb_node **free_head,
				912	struct ocfs2_extent_map_entry **tail_ent)
				913	{
				914	struct rb_node node, next;
				915	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				916	struct ocfs2_extent_map_entry *ent;
				917
				918	*free_head = NULL;
				919
				920	ent = NULL;
				921	node = rb_last(&em->em_extents);
				922	while (node)
				923	{
				924	next = rb_prev(node);
				925
				926	ent = rb_entry(node, struct ocfs2_extent_map_entry,
				927	e_node);
				928	if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
				929	break;
				930
				931	rb_erase(&ent->e_node, &em->em_extents);
				932
				933	node->rb_right = *free_head;
				934	*free_head = node;
				935
				936	ent = NULL;
				937	node = next;
				938	}
				939
				940	/* Do we have an entry straddling new_clusters? */
				941	if (tail_ent) {
				942	if (ent &&
				943	((le32_to_cpu(ent->e_rec.e_cpos) +
				944	le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
				945	*tail_ent = ent;
				946	else
				947	*tail_ent = NULL;
				948	}
				949	}
				950
				951	static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
				952	{
				953	struct rb_node *node;
				954	struct ocfs2_extent_map_entry *ent;
				955
				956	while (free_head) {
				957	node = free_head;
				958	free_head = node->rb_right;
				959
				960	ent = rb_entry(node, struct ocfs2_extent_map_entry,
				961	e_node);
				962	kmem_cache_free(ocfs2_em_ent_cachep, ent);
				963	}
				964	}
				965
				966	/*
				967	* Remove all entries past new_clusters, inclusive of an entry that
				968	* contains new_clusters. This is effectively a cache forget.
				969	*
				970	* If you want to also clip the last extent by some number of clusters,
				971	* you need to call ocfs2_extent_map_trunc().
				972	* This code does not check or modify ip_clusters.
				973	*/
				974	int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
				975	{
				976	struct rb_node *free_head = NULL;
				977	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				978	struct ocfs2_extent_map_entry *ent;
				979
				980	spin_lock(&OCFS2_I(inode)->ip_lock);
				981
				982	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
				983
				984	if (ent) {
				985	rb_erase(&ent->e_node, &em->em_extents);
				986	ent->e_node.rb_right = free_head;
				987	free_head = &ent->e_node;
				988	}
				989
				990	spin_unlock(&OCFS2_I(inode)->ip_lock);
				991
				992	if (free_head)
				993	__ocfs2_extent_map_drop_cleanup(free_head);
				994
				995	return 0;
				996	}
				997
				998	/*
				999	* Remove all entries past new_clusters and also clip any extent
				1000	* straddling new_clusters, if there is one. This does not check
				1001	* or modify ip_clusters
				1002	*/
				1003	int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
				1004	{
				1005	struct rb_node *free_head = NULL;
				1006	struct ocfs2_extent_map_entry *ent = NULL;
				1007
				1008	spin_lock(&OCFS2_I(inode)->ip_lock);
				1009
				1010	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
				1011
				1012	if (ent)
				1013	ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
				1014	le32_to_cpu(ent->e_rec.e_cpos));
				1015
				1016	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
				1017
				1018	spin_unlock(&OCFS2_I(inode)->ip_lock);
				1019
				1020	if (free_head)
				1021	__ocfs2_extent_map_drop_cleanup(free_head);
				1022
				1023	return 0;
				1024	}
				1025
				1026	int __init init_ocfs2_extent_maps(void)
				1027	{
				1028	ocfs2_em_ent_cachep =
				1029	kmem_cache_create("ocfs2_em_ent",
				1030	sizeof(struct ocfs2_extent_map_entry),
				1031	0, SLAB_HWCACHE_ALIGN, NULL, NULL);
				1032	if (!ocfs2_em_ent_cachep)
				1033	return -ENOMEM;
				1034
				1035	return 0;
				1036	}
				1037
Adrian Bunk	0c6c98f	2006-01-07 20:07:02 +0100	[diff] [blame]	1038	void exit_ocfs2_extent_maps(void)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1039	{
				1040	kmem_cache_destroy(ocfs2_em_ent_cachep);
				1041	}