Blame - fs/ocfs2/extent_map.c - kernel/msm-4.9

blob: 1a5c69071df642afbe99a04ed348b33c955c9861 [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -*- mode: c; c-basic-offset: 8; -*-
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* extent_map.c
				5	*
				6	* In-memory extent map for OCFS2. Man, this code was prettier in
				7	* the library.
				8	*
				9	* Copyright (C) 2004 Oracle. All rights reserved.
				10	*
				11	* This program is free software; you can redistribute it and/or
				12	* modify it under the terms of the GNU General Public
				13	* License, version 2, as published by the Free Software Foundation.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 02111-1307, USA.
				24	*/
				25
				26	#include <linux/fs.h>
				27	#include <linux/init.h>
				28	#include <linux/types.h>
				29	#include <linux/slab.h>
				30	#include <linux/rbtree.h>
				31
				32	#define MLOG_MASK_PREFIX ML_EXTENT_MAP
				33	#include <cluster/masklog.h>
				34
				35	#include "ocfs2.h"
				36
				37	#include "extent_map.h"
				38	#include "inode.h"
				39	#include "super.h"
				40
				41	#include "buffer_head_io.h"
				42
				43
				44	/*
				45	* SUCK SUCK SUCK
				46	* Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
				47	*/
				48
				49	struct ocfs2_extent_map_entry {
				50	struct rb_node e_node;
				51	int e_tree_depth;
				52	struct ocfs2_extent_rec e_rec;
				53	};
				54
				55	struct ocfs2_em_insert_context {
				56	int need_left;
				57	int need_right;
				58	struct ocfs2_extent_map_entry *new_ent;
				59	struct ocfs2_extent_map_entry *old_ent;
				60	struct ocfs2_extent_map_entry *left_ent;
				61	struct ocfs2_extent_map_entry *right_ent;
				62	};
				63
				64	static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
				65
				66
				67	static struct ocfs2_extent_map_entry *
				68	ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
				69	u32 cpos, u32 clusters,
				70	struct rb_node ***ret_p,
				71	struct rb_node **ret_parent);
				72	static int ocfs2_extent_map_insert(struct inode *inode,
				73	struct ocfs2_extent_rec *rec,
				74	int tree_depth);
				75	static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
				76	struct ocfs2_extent_map_entry *ent);
				77	static int ocfs2_extent_map_find_leaf(struct inode *inode,
				78	u32 cpos, u32 clusters,
				79	struct ocfs2_extent_list *el);
				80	static int ocfs2_extent_map_lookup_read(struct inode *inode,
				81	u32 cpos, u32 clusters,
				82	struct ocfs2_extent_map_entry **ret_ent);
				83	static int ocfs2_extent_map_try_insert(struct inode *inode,
				84	struct ocfs2_extent_rec *rec,
				85	int tree_depth,
				86	struct ocfs2_em_insert_context *ctxt);
				87
				88	/* returns 1 only if the rec contains all the given clusters -- that is that
				89	* rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
				90	* clusters) is >= the argument's endpoint */
				91	static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
				92	u32 cpos, u32 clusters)
				93	{
				94	if (le32_to_cpu(rec->e_cpos) > cpos)
				95	return 0;
				96	if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
				97	le32_to_cpu(rec->e_clusters))
				98	return 0;
				99	return 1;
				100	}
				101
				102
				103	/*
				104	* Find an entry in the tree that intersects the region passed in.
				105	* Note that this will find straddled intervals, it is up to the
				106	* callers to enforce any boundary conditions.
				107	*
				108	* Callers must hold ip_lock. This lookup is not guaranteed to return
				109	* a tree_depth 0 match, and as such can race inserts if the lock
				110	* were not held.
				111	*
				112	* The rb_node garbage lets insertion share the search. Trivial
				113	* callers pass NULL.
				114	*/
				115	static struct ocfs2_extent_map_entry *
				116	ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
				117	u32 cpos, u32 clusters,
				118	struct rb_node ***ret_p,
				119	struct rb_node **ret_parent)
				120	{
				121	struct rb_node **p = &em->em_extents.rb_node;
				122	struct rb_node *parent = NULL;
				123	struct ocfs2_extent_map_entry *ent = NULL;
				124
				125	while (*p)
				126	{
				127	parent = *p;
				128	ent = rb_entry(parent, struct ocfs2_extent_map_entry,
				129	e_node);
				130	if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
				131	p = &(*p)->rb_left;
				132	ent = NULL;
				133	} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
				134	le32_to_cpu(ent->e_rec.e_clusters))) {
				135	p = &(*p)->rb_right;
				136	ent = NULL;
				137	} else
				138	break;
				139	}
				140
				141	if (ret_p != NULL)
				142	*ret_p = p;
				143	if (ret_parent != NULL)
				144	*ret_parent = parent;
				145	return ent;
				146	}
				147
				148	/*
				149	* Find the leaf containing the interval we want. While we're on our
				150	* way down the tree, fill in every record we see at any depth, because
				151	* we might want it later.
				152	*
				153	* Note that this code is run without ip_lock. That's because it
				154	* sleeps while reading. If someone is also filling the extent list at
				155	* the same time we are, we might have to restart.
				156	*/
				157	static int ocfs2_extent_map_find_leaf(struct inode *inode,
				158	u32 cpos, u32 clusters,
				159	struct ocfs2_extent_list *el)
				160	{
				161	int i, ret;
				162	struct buffer_head *eb_bh = NULL;
				163	u64 blkno;
				164	u32 rec_end;
				165	struct ocfs2_extent_block *eb;
				166	struct ocfs2_extent_rec *rec;
				167
				168	/*
				169	* The bh data containing the el cannot change here, because
				170	* we hold alloc_sem. So we can do this without other
				171	* locks.
				172	*/
				173	while (el->l_tree_depth)
				174	{
				175	blkno = 0;
				176	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
				177	rec = &el->l_recs[i];
				178	rec_end = (le32_to_cpu(rec->e_cpos) +
				179	le32_to_cpu(rec->e_clusters));
				180
				181	ret = -EBADR;
				182	if (rec_end > OCFS2_I(inode)->ip_clusters) {
				183	mlog_errno(ret);
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	184	ocfs2_error(inode->i_sb,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	185	"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	186	i,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	187	(unsigned long long)le64_to_cpu(rec->e_blkno),
				188	(unsigned long long)OCFS2_I(inode)->ip_blkno,
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	189	OCFS2_I(inode)->ip_clusters);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	190	goto out_free;
				191	}
				192
				193	if (rec_end <= cpos) {
				194	ret = ocfs2_extent_map_insert(inode, rec,
				195	le16_to_cpu(el->l_tree_depth));
				196	if (ret && (ret != -EEXIST)) {
				197	mlog_errno(ret);
				198	goto out_free;
				199	}
				200	continue;
				201	}
				202	if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
				203	ret = ocfs2_extent_map_insert(inode, rec,
				204	le16_to_cpu(el->l_tree_depth));
				205	if (ret && (ret != -EEXIST)) {
				206	mlog_errno(ret);
				207	goto out_free;
				208	}
				209	continue;
				210	}
				211
				212	/*
				213	* We've found a record that matches our
				214	* interval. We don't insert it because we're
				215	* about to traverse it.
				216	*/
				217
				218	/* Check to see if we're stradling */
				219	ret = -ESRCH;
				220	if (!ocfs2_extent_rec_contains_clusters(rec,
				221	cpos,
				222	clusters)) {
				223	mlog_errno(ret);
				224	goto out_free;
				225	}
				226
				227	/*
				228	* If we've already found a record, the el has
				229	* two records covering the same interval.
				230	* EEEK!
				231	*/
				232	ret = -EBADR;
				233	if (blkno) {
				234	mlog_errno(ret);
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	235	ocfs2_error(inode->i_sb,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	236	"Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	237	cpos, clusters,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	238	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				239	(unsigned long long)blkno, i,
				240	(unsigned long long)le64_to_cpu(rec->e_blkno));
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	241	goto out_free;
				242	}
				243
				244	blkno = le64_to_cpu(rec->e_blkno);
				245	}
				246
				247	/*
				248	* We don't support holes, and we're still up
				249	* in the branches, so we'd better have found someone
				250	*/
				251	ret = -EBADR;
				252	if (!blkno) {
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	253	ocfs2_error(inode->i_sb,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	254	"No record found for (cpos = %u, clusters = %u) on inode %llu\n",
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	255	cpos, clusters,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	256	(unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	257	mlog_errno(ret);
				258	goto out_free;
				259	}
				260
				261	if (eb_bh) {
				262	brelse(eb_bh);
				263	eb_bh = NULL;
				264	}
				265	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				266	blkno, &eb_bh, OCFS2_BH_CACHED,
				267	inode);
				268	if (ret) {
				269	mlog_errno(ret);
				270	goto out_free;
				271	}
				272	eb = (struct ocfs2_extent_block *)eb_bh->b_data;
				273	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
				274	OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
				275	ret = -EIO;
				276	goto out_free;
				277	}
				278	el = &eb->h_list;
				279	}
				280
Eric Sesterhenn / snakebyte	ebdec83	2006-01-27 10:32:52 +0100	[diff] [blame]	281	BUG_ON(el->l_tree_depth);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	282
				283	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
				284	rec = &el->l_recs[i];
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	285
				286	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
				287	OCFS2_I(inode)->ip_clusters) {
				288	ret = -EBADR;
				289	mlog_errno(ret);
				290	ocfs2_error(inode->i_sb,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	291	"Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	292	i,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	293	(unsigned long long)le64_to_cpu(rec->e_blkno),
				294	(unsigned long long)OCFS2_I(inode)->ip_blkno,
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	295	OCFS2_I(inode)->ip_clusters);
				296	return ret;
				297	}
				298
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	299	ret = ocfs2_extent_map_insert(inode, rec,
				300	le16_to_cpu(el->l_tree_depth));
				301	if (ret) {
				302	mlog_errno(ret);
				303	goto out_free;
				304	}
				305	}
				306
				307	ret = 0;
				308
				309	out_free:
				310	if (eb_bh)
				311	brelse(eb_bh);
				312
				313	return ret;
				314	}
				315
				316	/*
				317	* This lookup actually will read from disk. It has one invariant:
				318	* It will never re-traverse blocks. This means that all inserts should
				319	* be new regions or more granular regions (both allowed by insert).
				320	*/
				321	static int ocfs2_extent_map_lookup_read(struct inode *inode,
				322	u32 cpos,
				323	u32 clusters,
				324	struct ocfs2_extent_map_entry **ret_ent)
				325	{
				326	int ret;
				327	u64 blkno;
				328	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				329	struct ocfs2_extent_map_entry *ent;
				330	struct buffer_head *bh = NULL;
				331	struct ocfs2_extent_block *eb;
				332	struct ocfs2_dinode *di;
				333	struct ocfs2_extent_list *el;
				334
				335	spin_lock(&OCFS2_I(inode)->ip_lock);
				336	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
				337	if (ent) {
				338	if (!ent->e_tree_depth) {
				339	spin_unlock(&OCFS2_I(inode)->ip_lock);
				340	*ret_ent = ent;
				341	return 0;
				342	}
				343	blkno = le64_to_cpu(ent->e_rec.e_blkno);
				344	spin_unlock(&OCFS2_I(inode)->ip_lock);
				345
				346	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
				347	OCFS2_BH_CACHED, inode);
				348	if (ret) {
				349	mlog_errno(ret);
				350	if (bh)
				351	brelse(bh);
				352	return ret;
				353	}
				354	eb = (struct ocfs2_extent_block *)bh->b_data;
				355	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
				356	OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
				357	brelse(bh);
				358	return -EIO;
				359	}
				360	el = &eb->h_list;
				361	} else {
				362	spin_unlock(&OCFS2_I(inode)->ip_lock);
				363
				364	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				365	OCFS2_I(inode)->ip_blkno, &bh,
				366	OCFS2_BH_CACHED, inode);
				367	if (ret) {
				368	mlog_errno(ret);
				369	if (bh)
				370	brelse(bh);
				371	return ret;
				372	}
				373	di = (struct ocfs2_dinode *)bh->b_data;
				374	if (!OCFS2_IS_VALID_DINODE(di)) {
				375	brelse(bh);
				376	OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
				377	return -EIO;
				378	}
				379	el = &di->id2.i_list;
				380	}
				381
				382	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
				383	brelse(bh);
				384	if (ret) {
				385	mlog_errno(ret);
				386	return ret;
				387	}
				388
				389	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
				390	if (!ent) {
				391	ret = -ESRCH;
				392	mlog_errno(ret);
				393	return ret;
				394	}
				395
Eric Sesterhenn / snakebyte	ebdec83	2006-01-27 10:32:52 +0100	[diff] [blame]	396	/* FIXME: Make sure this isn't a corruption */
				397	BUG_ON(ent->e_tree_depth);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	398
				399	*ret_ent = ent;
				400
				401	return 0;
				402	}
				403
				404	/*
				405	* Callers must hold ip_lock. This can insert pieces of the tree,
				406	* thus racing lookup if the lock weren't held.
				407	*/
				408	static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
				409	struct ocfs2_extent_map_entry *ent)
				410	{
				411	struct rb_node *p, parent;
				412	struct ocfs2_extent_map_entry *old_ent;
				413
				414	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
				415	le32_to_cpu(ent->e_rec.e_clusters),
				416	&p, &parent);
				417	if (old_ent)
				418	return -EEXIST;
				419
				420	rb_link_node(&ent->e_node, parent, p);
				421	rb_insert_color(&ent->e_node, &em->em_extents);
				422
				423	return 0;
				424	}
				425
				426
				427	/*
				428	* Simple rule: on any return code other than -EAGAIN, anything left
				429	* in the insert_context will be freed.
				430	*/
				431	static int ocfs2_extent_map_try_insert(struct inode *inode,
				432	struct ocfs2_extent_rec *rec,
				433	int tree_depth,
				434	struct ocfs2_em_insert_context *ctxt)
				435	{
				436	int ret;
				437	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				438	struct ocfs2_extent_map_entry *old_ent;
				439
				440	ctxt->need_left = 0;
				441	ctxt->need_right = 0;
				442	ctxt->old_ent = NULL;
				443
				444	spin_lock(&OCFS2_I(inode)->ip_lock);
				445	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
				446	if (!ret) {
				447	ctxt->new_ent = NULL;
				448	goto out_unlock;
				449	}
				450
				451	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
				452	le32_to_cpu(rec->e_clusters), NULL,
				453	NULL);
				454
Eric Sesterhenn / snakebyte	ebdec83	2006-01-27 10:32:52 +0100	[diff] [blame]	455	BUG_ON(!old_ent);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	456
				457	ret = -EEXIST;
				458	if (old_ent->e_tree_depth < tree_depth)
				459	goto out_unlock;
				460
				461	if (old_ent->e_tree_depth == tree_depth) {
				462	if (!memcmp(rec, &old_ent->e_rec,
				463	sizeof(struct ocfs2_extent_rec)))
				464	ret = 0;
				465
				466	/* FIXME: Should this be ESRCH/EBADR??? */
				467	goto out_unlock;
				468	}
				469
				470	/*
				471	* We do it in this order specifically so that no actual tree
				472	* changes occur until we have all the pieces we need. We
				473	* don't want malloc failures to leave an inconsistent tree.
				474	* Whenever we drop the lock, another process could be
				475	* inserting. Also note that, if another process just beat us
				476	* to an insert, we might not need the same pieces we needed
				477	* the first go round. In the end, the pieces we need will
				478	* be used, and the pieces we don't will be freed.
				479	*/
				480	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
				481	le32_to_cpu(old_ent->e_rec.e_cpos));
				482	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
				483	le32_to_cpu(old_ent->e_rec.e_clusters)) >
				484	(le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
				485	ret = -EAGAIN;
				486	if (ctxt->need_left) {
				487	if (!ctxt->left_ent)
				488	goto out_unlock;
				489	(ctxt->left_ent) = old_ent;
				490	ctxt->left_ent->e_rec.e_clusters =
				491	cpu_to_le32(le32_to_cpu(rec->e_cpos) -
				492	le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
				493	}
				494	if (ctxt->need_right) {
				495	if (!ctxt->right_ent)
				496	goto out_unlock;
				497	(ctxt->right_ent) = old_ent;
				498	ctxt->right_ent->e_rec.e_cpos =
				499	cpu_to_le32(le32_to_cpu(rec->e_cpos) +
				500	le32_to_cpu(rec->e_clusters));
				501	ctxt->right_ent->e_rec.e_clusters =
				502	cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
				503	le32_to_cpu(old_ent->e_rec.e_clusters)) -
				504	le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
				505	}
				506
				507	rb_erase(&old_ent->e_node, &em->em_extents);
				508	/* Now that he's erased, set him up for deletion */
				509	ctxt->old_ent = old_ent;
				510
				511	if (ctxt->need_left) {
				512	ret = ocfs2_extent_map_insert_entry(em,
				513	ctxt->left_ent);
				514	if (ret)
				515	goto out_unlock;
				516	ctxt->left_ent = NULL;
				517	}
				518
				519	if (ctxt->need_right) {
				520	ret = ocfs2_extent_map_insert_entry(em,
				521	ctxt->right_ent);
				522	if (ret)
				523	goto out_unlock;
				524	ctxt->right_ent = NULL;
				525	}
				526
				527	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
				528
				529	if (!ret)
				530	ctxt->new_ent = NULL;
				531
				532	out_unlock:
				533	spin_unlock(&OCFS2_I(inode)->ip_lock);
				534
				535	return ret;
				536	}
				537
				538
				539	static int ocfs2_extent_map_insert(struct inode *inode,
				540	struct ocfs2_extent_rec *rec,
				541	int tree_depth)
				542	{
				543	int ret;
				544	struct ocfs2_em_insert_context ctxt = {0, };
				545
				546	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
				547	OCFS2_I(inode)->ip_map.em_clusters) {
				548	ret = -EBADR;
				549	mlog_errno(ret);
				550	return ret;
				551	}
				552
				553	/* Zero e_clusters means a truncated tail record. It better be EOF */
				554	if (!rec->e_clusters) {
				555	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
				556	OCFS2_I(inode)->ip_map.em_clusters) {
				557	ret = -EBADR;
				558	mlog_errno(ret);
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	559	ocfs2_error(inode->i_sb,
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	560	"Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
				561	(unsigned long long)le64_to_cpu(rec->e_blkno),
				562	(unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	563	return ret;
				564	}
				565
				566	/* Ignore the truncated tail */
				567	return 0;
				568	}
				569
				570	ret = -ENOMEM;
				571	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
Sunil Mushran	afae00ab	2006-04-12 14:37:00 -0700	[diff] [blame]	572	GFP_NOFS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	573	if (!ctxt.new_ent) {
				574	mlog_errno(ret);
				575	return ret;
				576	}
				577
				578	ctxt.new_ent->e_rec = *rec;
				579	ctxt.new_ent->e_tree_depth = tree_depth;
				580
				581	do {
				582	ret = -ENOMEM;
				583	if (ctxt.need_left && !ctxt.left_ent) {
				584	ctxt.left_ent =
				585	kmem_cache_alloc(ocfs2_em_ent_cachep,
Sunil Mushran	afae00ab	2006-04-12 14:37:00 -0700	[diff] [blame]	586	GFP_NOFS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	587	if (!ctxt.left_ent)
				588	break;
				589	}
				590	if (ctxt.need_right && !ctxt.right_ent) {
				591	ctxt.right_ent =
				592	kmem_cache_alloc(ocfs2_em_ent_cachep,
Sunil Mushran	afae00ab	2006-04-12 14:37:00 -0700	[diff] [blame]	593	GFP_NOFS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	594	if (!ctxt.right_ent)
				595	break;
				596	}
				597
				598	ret = ocfs2_extent_map_try_insert(inode, rec,
				599	tree_depth, &ctxt);
				600	} while (ret == -EAGAIN);
				601
				602	if (ret < 0)
				603	mlog_errno(ret);
				604
				605	if (ctxt.left_ent)
				606	kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
				607	if (ctxt.right_ent)
				608	kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
				609	if (ctxt.old_ent)
				610	kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
				611	if (ctxt.new_ent)
				612	kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
				613
				614	return ret;
				615	}
				616
				617	/*
				618	* Append this record to the tail of the extent map. It must be
				619	* tree_depth 0. The record might be an extension of an existing
				620	* record, and as such that needs to be handled. eg:
				621	*
				622	* Existing record in the extent map:
				623	*
				624	* cpos = 10, len = 10
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	625	* \|---------\|
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	626	*
				627	* New Record:
				628	*
				629	* cpos = 10, len = 20
Joel Becker	110ba90	2006-02-28 17:58:36 -0800	[diff] [blame]	630	* \|------------------\|
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	631	*
				632	* The passed record is the new on-disk record. The new_clusters value
				633	* is how many clusters were added to the file. If the append is a
				634	* contiguous append, the new_clusters has been added to
				635	* rec->e_clusters. If the append is an entirely new extent, then
				636	* rec->e_clusters is == new_clusters.
				637	*/
				638	int ocfs2_extent_map_append(struct inode *inode,
				639	struct ocfs2_extent_rec *rec,
				640	u32 new_clusters)
				641	{
				642	int ret;
				643	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				644	struct ocfs2_extent_map_entry *ent;
				645	struct ocfs2_extent_rec *old;
				646
				647	BUG_ON(!new_clusters);
				648	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
				649
				650	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
				651	/*
				652	* Size changed underneath us on disk. Drop any
				653	* straddling records and update our idea of
				654	* i_clusters
				655	*/
				656	ocfs2_extent_map_drop(inode, em->em_clusters - 1);
				657	em->em_clusters = OCFS2_I(inode)->ip_clusters;
				658	}
				659
				660	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
				661	le32_to_cpu(rec->e_clusters)) !=
				662	(em->em_clusters + new_clusters),
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	663	"Inode %llu:\n"
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	664	"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
				665	"em->em_clusters = %u + new_clusters = %u = %u\n",
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	666	(unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	667	le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
				668	le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
				669	em->em_clusters, new_clusters,
				670	em->em_clusters + new_clusters);
				671
				672	em->em_clusters += new_clusters;
				673
				674	ret = -ENOENT;
				675	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
				676	/* This is a contiguous append */
				677	ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
				678	NULL, NULL);
				679	if (ent) {
				680	old = &ent->e_rec;
				681	BUG_ON((le32_to_cpu(rec->e_cpos) +
				682	le32_to_cpu(rec->e_clusters)) !=
				683	(le32_to_cpu(old->e_cpos) +
				684	le32_to_cpu(old->e_clusters) +
				685	new_clusters));
				686	if (ent->e_tree_depth == 0) {
				687	BUG_ON(le32_to_cpu(old->e_cpos) !=
				688	le32_to_cpu(rec->e_cpos));
				689	BUG_ON(le64_to_cpu(old->e_blkno) !=
				690	le64_to_cpu(rec->e_blkno));
				691	ret = 0;
				692	}
				693	/*
				694	* Let non-leafs fall through as -ENOENT to
				695	* force insertion of the new leaf.
				696	*/
				697	le32_add_cpu(&old->e_clusters, new_clusters);
				698	}
				699	}
				700
				701	if (ret == -ENOENT)
				702	ret = ocfs2_extent_map_insert(inode, rec, 0);
				703	if (ret < 0)
				704	mlog_errno(ret);
				705	return ret;
				706	}
				707
				708	#if 0
				709	/* Code here is included but defined out as it completes the extent
				710	* map api and may be used in the future. */
				711
				712	/*
				713	* Look up the record containing this cluster offset. This record is
				714	* part of the extent map. Do not free it. Any changes you make to
				715	* it will reflect in the extent map. So, if your last extent
				716	* is (cpos = 10, clusters = 10) and you truncate the file by 5
				717	* clusters, you can do:
				718	*
				719	* ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
				720	* rec->e_clusters -= 5;
				721	*
				722	* The lookup does not read from disk. If the map isn't filled in for
				723	* an entry, you won't find it.
				724	*
				725	* Also note that the returned record is valid until alloc_sem is
				726	* dropped. After that, truncate and extend can happen. Caveat Emptor.
				727	*/
				728	int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
				729	struct ocfs2_extent_rec **rec,
				730	int *tree_depth)
				731	{
				732	int ret = -ENOENT;
				733	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				734	struct ocfs2_extent_map_entry *ent;
				735
				736	*rec = NULL;
				737
				738	if (cpos >= OCFS2_I(inode)->ip_clusters)
				739	return -EINVAL;
				740
				741	if (cpos >= em->em_clusters) {
				742	/*
				743	* Size changed underneath us on disk. Drop any
				744	* straddling records and update our idea of
				745	* i_clusters
				746	*/
				747	ocfs2_extent_map_drop(inode, em->em_clusters - 1);
				748	em->em_clusters = OCFS2_I(inode)->ip_clusters ;
				749	}
				750
				751	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
				752	NULL, NULL);
				753
				754	if (ent) {
				755	*rec = &ent->e_rec;
				756	if (tree_depth)
				757	*tree_depth = ent->e_tree_depth;
				758	ret = 0;
				759	}
				760
				761	return ret;
				762	}
				763
				764	int ocfs2_extent_map_get_clusters(struct inode *inode,
				765	u32 v_cpos, int count,
				766	u32 p_cpos, int ret_count)
				767	{
				768	int ret;
				769	u32 coff, ccount;
				770	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				771	struct ocfs2_extent_map_entry *ent = NULL;
				772
				773	*p_cpos = ccount = 0;
				774
				775	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
				776	return -EINVAL;
				777
				778	if ((v_cpos + count) > em->em_clusters) {
				779	/*
				780	* Size changed underneath us on disk. Drop any
				781	* straddling records and update our idea of
				782	* i_clusters
				783	*/
				784	ocfs2_extent_map_drop(inode, em->em_clusters - 1);
				785	em->em_clusters = OCFS2_I(inode)->ip_clusters;
				786	}
				787
				788
				789	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
				790	if (ret)
				791	return ret;
				792
				793	if (ent) {
				794	/* We should never find ourselves straddling an interval */
				795	if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
				796	v_cpos,
				797	count))
				798	return -ESRCH;
				799
				800	coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
				801	*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
				802	le64_to_cpu(ent->e_rec.e_blkno)) +
				803	coff;
				804
				805	if (ret_count)
				806	*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
				807
				808	return 0;
				809	}
				810
				811
				812	return -ENOENT;
				813	}
				814
				815	#endif /* 0 */
				816
				817	int ocfs2_extent_map_get_blocks(struct inode *inode,
				818	u64 v_blkno, int count,
				819	u64 p_blkno, int ret_count)
				820	{
				821	int ret;
				822	u64 boff;
				823	u32 cpos, clusters;
				824	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
				825	struct ocfs2_extent_map_entry *ent = NULL;
				826	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				827	struct ocfs2_extent_rec *rec;
				828
				829	*p_blkno = 0;
				830
				831	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
				832	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
				833	(u64)count + bpc - 1);
				834	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
				835	ret = -EINVAL;
				836	mlog_errno(ret);
				837	return ret;
				838	}
				839
				840	if ((cpos + clusters) > em->em_clusters) {
				841	/*
				842	* Size changed underneath us on disk. Drop any
				843	* straddling records and update our idea of
				844	* i_clusters
				845	*/
				846	ocfs2_extent_map_drop(inode, em->em_clusters - 1);
				847	em->em_clusters = OCFS2_I(inode)->ip_clusters;
				848	}
				849
				850	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
				851	if (ret) {
				852	mlog_errno(ret);
				853	return ret;
				854	}
				855
				856	if (ent)
				857	{
				858	rec = &ent->e_rec;
				859
				860	/* We should never find ourselves straddling an interval */
				861	if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
				862	ret = -ESRCH;
				863	mlog_errno(ret);
				864	return ret;
				865	}
				866
				867	boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
				868	le32_to_cpu(rec->e_cpos));
				869	boff += (v_blkno & (u64)(bpc - 1));
				870	*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
				871
				872	if (ret_count) {
				873	*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
				874	le32_to_cpu(rec->e_clusters)) - boff;
				875	}
				876
				877	return 0;
				878	}
				879
				880	return -ENOENT;
				881	}
				882
				883	int ocfs2_extent_map_init(struct inode *inode)
				884	{
				885	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				886
				887	em->em_extents = RB_ROOT;
				888	em->em_clusters = 0;
				889
				890	return 0;
				891	}
				892
				893	/* Needs the lock */
				894	static void __ocfs2_extent_map_drop(struct inode *inode,
				895	u32 new_clusters,
				896	struct rb_node **free_head,
				897	struct ocfs2_extent_map_entry **tail_ent)
				898	{
				899	struct rb_node node, next;
				900	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				901	struct ocfs2_extent_map_entry *ent;
				902
				903	*free_head = NULL;
				904
				905	ent = NULL;
				906	node = rb_last(&em->em_extents);
				907	while (node)
				908	{
				909	next = rb_prev(node);
				910
				911	ent = rb_entry(node, struct ocfs2_extent_map_entry,
				912	e_node);
				913	if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
				914	break;
				915
				916	rb_erase(&ent->e_node, &em->em_extents);
				917
				918	node->rb_right = *free_head;
				919	*free_head = node;
				920
				921	ent = NULL;
				922	node = next;
				923	}
				924
				925	/* Do we have an entry straddling new_clusters? */
				926	if (tail_ent) {
				927	if (ent &&
				928	((le32_to_cpu(ent->e_rec.e_cpos) +
				929	le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
				930	*tail_ent = ent;
				931	else
				932	*tail_ent = NULL;
				933	}
				934	}
				935
				936	static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
				937	{
				938	struct rb_node *node;
				939	struct ocfs2_extent_map_entry *ent;
				940
				941	while (free_head) {
				942	node = free_head;
				943	free_head = node->rb_right;
				944
				945	ent = rb_entry(node, struct ocfs2_extent_map_entry,
				946	e_node);
				947	kmem_cache_free(ocfs2_em_ent_cachep, ent);
				948	}
				949	}
				950
				951	/*
				952	* Remove all entries past new_clusters, inclusive of an entry that
				953	* contains new_clusters. This is effectively a cache forget.
				954	*
				955	* If you want to also clip the last extent by some number of clusters,
				956	* you need to call ocfs2_extent_map_trunc().
				957	* This code does not check or modify ip_clusters.
				958	*/
				959	int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
				960	{
				961	struct rb_node *free_head = NULL;
				962	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
				963	struct ocfs2_extent_map_entry *ent;
				964
				965	spin_lock(&OCFS2_I(inode)->ip_lock);
				966
				967	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
				968
				969	if (ent) {
				970	rb_erase(&ent->e_node, &em->em_extents);
				971	ent->e_node.rb_right = free_head;
				972	free_head = &ent->e_node;
				973	}
				974
				975	spin_unlock(&OCFS2_I(inode)->ip_lock);
				976
				977	if (free_head)
				978	__ocfs2_extent_map_drop_cleanup(free_head);
				979
				980	return 0;
				981	}
				982
				983	/*
				984	* Remove all entries past new_clusters and also clip any extent
				985	* straddling new_clusters, if there is one. This does not check
				986	* or modify ip_clusters
				987	*/
				988	int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
				989	{
				990	struct rb_node *free_head = NULL;
				991	struct ocfs2_extent_map_entry *ent = NULL;
				992
				993	spin_lock(&OCFS2_I(inode)->ip_lock);
				994
				995	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
				996
				997	if (ent)
				998	ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
				999	le32_to_cpu(ent->e_rec.e_cpos));
				1000
				1001	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
				1002
				1003	spin_unlock(&OCFS2_I(inode)->ip_lock);
				1004
				1005	if (free_head)
				1006	__ocfs2_extent_map_drop_cleanup(free_head);
				1007
				1008	return 0;
				1009	}
				1010
				1011	int __init init_ocfs2_extent_maps(void)
				1012	{
				1013	ocfs2_em_ent_cachep =
				1014	kmem_cache_create("ocfs2_em_ent",
				1015	sizeof(struct ocfs2_extent_map_entry),
				1016	0, SLAB_HWCACHE_ALIGN, NULL, NULL);
				1017	if (!ocfs2_em_ent_cachep)
				1018	return -ENOMEM;
				1019
				1020	return 0;
				1021	}
				1022
Adrian Bunk	0c6c98f	2006-01-07 20:07:02 +0100	[diff] [blame]	1023	void exit_ocfs2_extent_maps(void)
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1024	{
				1025	kmem_cache_destroy(ocfs2_em_ent_cachep);
				1026	}