blob: f2fb40cd296a2890b1af082794c1324ba6969c0d [file] [log] [blame]
Mark Fashehccd979b2005-12-15 14:31:24 -08001/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.c
5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in
7 * the library.
8 *
9 * Copyright (C) 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License, version 2, as published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/init.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "extent_map.h"
38#include "inode.h"
39#include "super.h"
40
41#include "buffer_head_io.h"
42
43
44/*
45 * SUCK SUCK SUCK
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
47 */
48
49struct ocfs2_extent_map_entry {
50 struct rb_node e_node;
51 int e_tree_depth;
52 struct ocfs2_extent_rec e_rec;
53};
54
55struct ocfs2_em_insert_context {
56 int need_left;
57 int need_right;
58 struct ocfs2_extent_map_entry *new_ent;
59 struct ocfs2_extent_map_entry *old_ent;
60 struct ocfs2_extent_map_entry *left_ent;
61 struct ocfs2_extent_map_entry *right_ent;
62};
63
64static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
65
66
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is that
89 * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
90 * clusters) is >= the argument's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 *
112 * The rb_node garbage lets insertion share the search. Trivial
113 * callers pass NULL.
114 */
115static struct ocfs2_extent_map_entry *
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{
121 struct rb_node **p = &em->em_extents.rb_node;
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140
141 if (ret_p != NULL)
142 *ret_p = p;
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146}
147
148/*
149 * Find the leaf containing the interval we want. While we're on our
150 * way down the tree, fill in every record we see at any depth, because
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{
161 int i, ret;
162 struct buffer_head *eb_bh = NULL;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
173 while (el->l_tree_depth)
174 {
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 goto out_free;
185 }
186
187 if (rec_end <= cpos) {
188 ret = ocfs2_extent_map_insert(inode, rec,
189 le16_to_cpu(el->l_tree_depth));
190 if (ret && (ret != -EEXIST)) {
191 mlog_errno(ret);
192 goto out_free;
193 }
194 continue;
195 }
196 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
197 ret = ocfs2_extent_map_insert(inode, rec,
198 le16_to_cpu(el->l_tree_depth));
199 if (ret && (ret != -EEXIST)) {
200 mlog_errno(ret);
201 goto out_free;
202 }
203 continue;
204 }
205
206 /*
207 * We've found a record that matches our
208 * interval. We don't insert it because we're
209 * about to traverse it.
210 */
211
212 /* Check to see if we're stradling */
213 ret = -ESRCH;
214 if (!ocfs2_extent_rec_contains_clusters(rec,
215 cpos,
216 clusters)) {
217 mlog_errno(ret);
218 goto out_free;
219 }
220
221 /*
222 * If we've already found a record, the el has
223 * two records covering the same interval.
224 * EEEK!
225 */
226 ret = -EBADR;
227 if (blkno) {
228 mlog_errno(ret);
229 goto out_free;
230 }
231
232 blkno = le64_to_cpu(rec->e_blkno);
233 }
234
235 /*
236 * We don't support holes, and we're still up
237 * in the branches, so we'd better have found someone
238 */
239 ret = -EBADR;
240 if (!blkno) {
241 mlog_errno(ret);
242 goto out_free;
243 }
244
245 if (eb_bh) {
246 brelse(eb_bh);
247 eb_bh = NULL;
248 }
249 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
250 blkno, &eb_bh, OCFS2_BH_CACHED,
251 inode);
252 if (ret) {
253 mlog_errno(ret);
254 goto out_free;
255 }
256 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
257 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
258 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
259 ret = -EIO;
260 goto out_free;
261 }
262 el = &eb->h_list;
263 }
264
265 if (el->l_tree_depth)
266 BUG();
267
268 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
269 rec = &el->l_recs[i];
270 ret = ocfs2_extent_map_insert(inode, rec,
271 le16_to_cpu(el->l_tree_depth));
272 if (ret) {
273 mlog_errno(ret);
274 goto out_free;
275 }
276 }
277
278 ret = 0;
279
280out_free:
281 if (eb_bh)
282 brelse(eb_bh);
283
284 return ret;
285}
286
287/*
288 * This lookup actually will read from disk. It has one invariant:
289 * It will never re-traverse blocks. This means that all inserts should
290 * be new regions or more granular regions (both allowed by insert).
291 */
292static int ocfs2_extent_map_lookup_read(struct inode *inode,
293 u32 cpos,
294 u32 clusters,
295 struct ocfs2_extent_map_entry **ret_ent)
296{
297 int ret;
298 u64 blkno;
299 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
300 struct ocfs2_extent_map_entry *ent;
301 struct buffer_head *bh = NULL;
302 struct ocfs2_extent_block *eb;
303 struct ocfs2_dinode *di;
304 struct ocfs2_extent_list *el;
305
306 spin_lock(&OCFS2_I(inode)->ip_lock);
307 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
308 if (ent) {
309 if (!ent->e_tree_depth) {
310 spin_unlock(&OCFS2_I(inode)->ip_lock);
311 *ret_ent = ent;
312 return 0;
313 }
314 blkno = le64_to_cpu(ent->e_rec.e_blkno);
315 spin_unlock(&OCFS2_I(inode)->ip_lock);
316
317 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
318 OCFS2_BH_CACHED, inode);
319 if (ret) {
320 mlog_errno(ret);
321 if (bh)
322 brelse(bh);
323 return ret;
324 }
325 eb = (struct ocfs2_extent_block *)bh->b_data;
326 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
327 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
328 brelse(bh);
329 return -EIO;
330 }
331 el = &eb->h_list;
332 } else {
333 spin_unlock(&OCFS2_I(inode)->ip_lock);
334
335 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
336 OCFS2_I(inode)->ip_blkno, &bh,
337 OCFS2_BH_CACHED, inode);
338 if (ret) {
339 mlog_errno(ret);
340 if (bh)
341 brelse(bh);
342 return ret;
343 }
344 di = (struct ocfs2_dinode *)bh->b_data;
345 if (!OCFS2_IS_VALID_DINODE(di)) {
346 brelse(bh);
347 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
348 return -EIO;
349 }
350 el = &di->id2.i_list;
351 }
352
353 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
354 brelse(bh);
355 if (ret) {
356 mlog_errno(ret);
357 return ret;
358 }
359
360 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
361 if (!ent) {
362 ret = -ESRCH;
363 mlog_errno(ret);
364 return ret;
365 }
366
367 if (ent->e_tree_depth)
368 BUG(); /* FIXME: Make sure this isn't a corruption */
369
370 *ret_ent = ent;
371
372 return 0;
373}
374
375/*
376 * Callers must hold ip_lock. This can insert pieces of the tree,
377 * thus racing lookup if the lock weren't held.
378 */
379static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
380 struct ocfs2_extent_map_entry *ent)
381{
382 struct rb_node **p, *parent;
383 struct ocfs2_extent_map_entry *old_ent;
384
385 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
386 le32_to_cpu(ent->e_rec.e_clusters),
387 &p, &parent);
388 if (old_ent)
389 return -EEXIST;
390
391 rb_link_node(&ent->e_node, parent, p);
392 rb_insert_color(&ent->e_node, &em->em_extents);
393
394 return 0;
395}
396
397
398/*
399 * Simple rule: on any return code other than -EAGAIN, anything left
400 * in the insert_context will be freed.
401 */
402static int ocfs2_extent_map_try_insert(struct inode *inode,
403 struct ocfs2_extent_rec *rec,
404 int tree_depth,
405 struct ocfs2_em_insert_context *ctxt)
406{
407 int ret;
408 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
409 struct ocfs2_extent_map_entry *old_ent;
410
411 ctxt->need_left = 0;
412 ctxt->need_right = 0;
413 ctxt->old_ent = NULL;
414
415 spin_lock(&OCFS2_I(inode)->ip_lock);
416 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
417 if (!ret) {
418 ctxt->new_ent = NULL;
419 goto out_unlock;
420 }
421
422 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
423 le32_to_cpu(rec->e_clusters), NULL,
424 NULL);
425
426 if (!old_ent)
427 BUG();
428
429 ret = -EEXIST;
430 if (old_ent->e_tree_depth < tree_depth)
431 goto out_unlock;
432
433 if (old_ent->e_tree_depth == tree_depth) {
434 if (!memcmp(rec, &old_ent->e_rec,
435 sizeof(struct ocfs2_extent_rec)))
436 ret = 0;
437
438 /* FIXME: Should this be ESRCH/EBADR??? */
439 goto out_unlock;
440 }
441
442 /*
443 * We do it in this order specifically so that no actual tree
444 * changes occur until we have all the pieces we need. We
445 * don't want malloc failures to leave an inconsistent tree.
446 * Whenever we drop the lock, another process could be
447 * inserting. Also note that, if another process just beat us
448 * to an insert, we might not need the same pieces we needed
449 * the first go round. In the end, the pieces we need will
450 * be used, and the pieces we don't will be freed.
451 */
452 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
453 le32_to_cpu(old_ent->e_rec.e_cpos));
454 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
455 le32_to_cpu(old_ent->e_rec.e_clusters)) >
456 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
457 ret = -EAGAIN;
458 if (ctxt->need_left) {
459 if (!ctxt->left_ent)
460 goto out_unlock;
461 *(ctxt->left_ent) = *old_ent;
462 ctxt->left_ent->e_rec.e_clusters =
463 cpu_to_le32(le32_to_cpu(rec->e_cpos) -
464 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
465 }
466 if (ctxt->need_right) {
467 if (!ctxt->right_ent)
468 goto out_unlock;
469 *(ctxt->right_ent) = *old_ent;
470 ctxt->right_ent->e_rec.e_cpos =
471 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
472 le32_to_cpu(rec->e_clusters));
473 ctxt->right_ent->e_rec.e_clusters =
474 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
475 le32_to_cpu(old_ent->e_rec.e_clusters)) -
476 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
477 }
478
479 rb_erase(&old_ent->e_node, &em->em_extents);
480 /* Now that he's erased, set him up for deletion */
481 ctxt->old_ent = old_ent;
482
483 if (ctxt->need_left) {
484 ret = ocfs2_extent_map_insert_entry(em,
485 ctxt->left_ent);
486 if (ret)
487 goto out_unlock;
488 ctxt->left_ent = NULL;
489 }
490
491 if (ctxt->need_right) {
492 ret = ocfs2_extent_map_insert_entry(em,
493 ctxt->right_ent);
494 if (ret)
495 goto out_unlock;
496 ctxt->right_ent = NULL;
497 }
498
499 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
500
501 if (!ret)
502 ctxt->new_ent = NULL;
503
504out_unlock:
505 spin_unlock(&OCFS2_I(inode)->ip_lock);
506
507 return ret;
508}
509
510
511static int ocfs2_extent_map_insert(struct inode *inode,
512 struct ocfs2_extent_rec *rec,
513 int tree_depth)
514{
515 int ret;
516 struct ocfs2_em_insert_context ctxt = {0, };
517
518 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
519 OCFS2_I(inode)->ip_map.em_clusters) {
520 ret = -EBADR;
521 mlog_errno(ret);
522 return ret;
523 }
524
525 /* Zero e_clusters means a truncated tail record. It better be EOF */
526 if (!rec->e_clusters) {
527 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
528 OCFS2_I(inode)->ip_map.em_clusters) {
529 ret = -EBADR;
530 mlog_errno(ret);
531 return ret;
532 }
533
534 /* Ignore the truncated tail */
535 return 0;
536 }
537
538 ret = -ENOMEM;
539 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
540 GFP_KERNEL);
541 if (!ctxt.new_ent) {
542 mlog_errno(ret);
543 return ret;
544 }
545
546 ctxt.new_ent->e_rec = *rec;
547 ctxt.new_ent->e_tree_depth = tree_depth;
548
549 do {
550 ret = -ENOMEM;
551 if (ctxt.need_left && !ctxt.left_ent) {
552 ctxt.left_ent =
553 kmem_cache_alloc(ocfs2_em_ent_cachep,
554 GFP_KERNEL);
555 if (!ctxt.left_ent)
556 break;
557 }
558 if (ctxt.need_right && !ctxt.right_ent) {
559 ctxt.right_ent =
560 kmem_cache_alloc(ocfs2_em_ent_cachep,
561 GFP_KERNEL);
562 if (!ctxt.right_ent)
563 break;
564 }
565
566 ret = ocfs2_extent_map_try_insert(inode, rec,
567 tree_depth, &ctxt);
568 } while (ret == -EAGAIN);
569
570 if (ret < 0)
571 mlog_errno(ret);
572
573 if (ctxt.left_ent)
574 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
575 if (ctxt.right_ent)
576 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
577 if (ctxt.old_ent)
578 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
579 if (ctxt.new_ent)
580 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
581
582 return ret;
583}
584
585/*
586 * Append this record to the tail of the extent map. It must be
587 * tree_depth 0. The record might be an extension of an existing
588 * record, and as such that needs to be handled. eg:
589 *
590 * Existing record in the extent map:
591 *
592 * cpos = 10, len = 10
593 * |---------|
594 *
595 * New Record:
596 *
597 * cpos = 10, len = 20
598 * |------------------|
599 *
600 * The passed record is the new on-disk record. The new_clusters value
601 * is how many clusters were added to the file. If the append is a
602 * contiguous append, the new_clusters has been added to
603 * rec->e_clusters. If the append is an entirely new extent, then
604 * rec->e_clusters is == new_clusters.
605 */
606int ocfs2_extent_map_append(struct inode *inode,
607 struct ocfs2_extent_rec *rec,
608 u32 new_clusters)
609{
610 int ret;
611 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
612 struct ocfs2_extent_map_entry *ent;
613 struct ocfs2_extent_rec *old;
614
615 BUG_ON(!new_clusters);
616 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
617
618 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
619 /*
620 * Size changed underneath us on disk. Drop any
621 * straddling records and update our idea of
622 * i_clusters
623 */
624 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
625 em->em_clusters = OCFS2_I(inode)->ip_clusters;
626 }
627
628 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
629 le32_to_cpu(rec->e_clusters)) !=
630 (em->em_clusters + new_clusters),
631 "Inode %"MLFu64":\n"
632 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
633 "em->em_clusters = %u + new_clusters = %u = %u\n",
634 OCFS2_I(inode)->ip_blkno,
635 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
636 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
637 em->em_clusters, new_clusters,
638 em->em_clusters + new_clusters);
639
640 em->em_clusters += new_clusters;
641
642 ret = -ENOENT;
643 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
644 /* This is a contiguous append */
645 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
646 NULL, NULL);
647 if (ent) {
648 old = &ent->e_rec;
649 BUG_ON((le32_to_cpu(rec->e_cpos) +
650 le32_to_cpu(rec->e_clusters)) !=
651 (le32_to_cpu(old->e_cpos) +
652 le32_to_cpu(old->e_clusters) +
653 new_clusters));
654 if (ent->e_tree_depth == 0) {
655 BUG_ON(le32_to_cpu(old->e_cpos) !=
656 le32_to_cpu(rec->e_cpos));
657 BUG_ON(le64_to_cpu(old->e_blkno) !=
658 le64_to_cpu(rec->e_blkno));
659 ret = 0;
660 }
661 /*
662 * Let non-leafs fall through as -ENOENT to
663 * force insertion of the new leaf.
664 */
665 le32_add_cpu(&old->e_clusters, new_clusters);
666 }
667 }
668
669 if (ret == -ENOENT)
670 ret = ocfs2_extent_map_insert(inode, rec, 0);
671 if (ret < 0)
672 mlog_errno(ret);
673 return ret;
674}
675
676#if 0
677/* Code here is included but defined out as it completes the extent
678 * map api and may be used in the future. */
679
680/*
681 * Look up the record containing this cluster offset. This record is
682 * part of the extent map. Do not free it. Any changes you make to
683 * it will reflect in the extent map. So, if your last extent
684 * is (cpos = 10, clusters = 10) and you truncate the file by 5
685 * clusters, you can do:
686 *
687 * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
688 * rec->e_clusters -= 5;
689 *
690 * The lookup does not read from disk. If the map isn't filled in for
691 * an entry, you won't find it.
692 *
693 * Also note that the returned record is valid until alloc_sem is
694 * dropped. After that, truncate and extend can happen. Caveat Emptor.
695 */
696int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
697 struct ocfs2_extent_rec **rec,
698 int *tree_depth)
699{
700 int ret = -ENOENT;
701 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
702 struct ocfs2_extent_map_entry *ent;
703
704 *rec = NULL;
705
706 if (cpos >= OCFS2_I(inode)->ip_clusters)
707 return -EINVAL;
708
709 if (cpos >= em->em_clusters) {
710 /*
711 * Size changed underneath us on disk. Drop any
712 * straddling records and update our idea of
713 * i_clusters
714 */
715 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
716 em->em_clusters = OCFS2_I(inode)->ip_clusters ;
717 }
718
719 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
720 NULL, NULL);
721
722 if (ent) {
723 *rec = &ent->e_rec;
724 if (tree_depth)
725 *tree_depth = ent->e_tree_depth;
726 ret = 0;
727 }
728
729 return ret;
730}
731
732int ocfs2_extent_map_get_clusters(struct inode *inode,
733 u32 v_cpos, int count,
734 u32 *p_cpos, int *ret_count)
735{
736 int ret;
737 u32 coff, ccount;
738 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
739 struct ocfs2_extent_map_entry *ent = NULL;
740
741 *p_cpos = ccount = 0;
742
743 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
744 return -EINVAL;
745
746 if ((v_cpos + count) > em->em_clusters) {
747 /*
748 * Size changed underneath us on disk. Drop any
749 * straddling records and update our idea of
750 * i_clusters
751 */
752 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
753 em->em_clusters = OCFS2_I(inode)->ip_clusters;
754 }
755
756
757 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
758 if (ret)
759 return ret;
760
761 if (ent) {
762 /* We should never find ourselves straddling an interval */
763 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
764 v_cpos,
765 count))
766 return -ESRCH;
767
768 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
769 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
770 le64_to_cpu(ent->e_rec.e_blkno)) +
771 coff;
772
773 if (ret_count)
774 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
775
776 return 0;
777 }
778
779
780 return -ENOENT;
781}
782
783#endif /* 0 */
784
785int ocfs2_extent_map_get_blocks(struct inode *inode,
786 u64 v_blkno, int count,
787 u64 *p_blkno, int *ret_count)
788{
789 int ret;
790 u64 boff;
791 u32 cpos, clusters;
792 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
793 struct ocfs2_extent_map_entry *ent = NULL;
794 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
795 struct ocfs2_extent_rec *rec;
796
797 *p_blkno = 0;
798
799 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
800 clusters = ocfs2_blocks_to_clusters(inode->i_sb,
801 (u64)count + bpc - 1);
802 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
803 ret = -EINVAL;
804 mlog_errno(ret);
805 return ret;
806 }
807
808 if ((cpos + clusters) > em->em_clusters) {
809 /*
810 * Size changed underneath us on disk. Drop any
811 * straddling records and update our idea of
812 * i_clusters
813 */
814 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
815 em->em_clusters = OCFS2_I(inode)->ip_clusters;
816 }
817
818 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
819 if (ret) {
820 mlog_errno(ret);
821 return ret;
822 }
823
824 if (ent)
825 {
826 rec = &ent->e_rec;
827
828 /* We should never find ourselves straddling an interval */
829 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
830 ret = -ESRCH;
831 mlog_errno(ret);
832 return ret;
833 }
834
835 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
836 le32_to_cpu(rec->e_cpos));
837 boff += (v_blkno & (u64)(bpc - 1));
838 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
839
840 if (ret_count) {
841 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
842 le32_to_cpu(rec->e_clusters)) - boff;
843 }
844
845 return 0;
846 }
847
848 return -ENOENT;
849}
850
851int ocfs2_extent_map_init(struct inode *inode)
852{
853 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
854
855 em->em_extents = RB_ROOT;
856 em->em_clusters = 0;
857
858 return 0;
859}
860
861/* Needs the lock */
862static void __ocfs2_extent_map_drop(struct inode *inode,
863 u32 new_clusters,
864 struct rb_node **free_head,
865 struct ocfs2_extent_map_entry **tail_ent)
866{
867 struct rb_node *node, *next;
868 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
869 struct ocfs2_extent_map_entry *ent;
870
871 *free_head = NULL;
872
873 ent = NULL;
874 node = rb_last(&em->em_extents);
875 while (node)
876 {
877 next = rb_prev(node);
878
879 ent = rb_entry(node, struct ocfs2_extent_map_entry,
880 e_node);
881 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
882 break;
883
884 rb_erase(&ent->e_node, &em->em_extents);
885
886 node->rb_right = *free_head;
887 *free_head = node;
888
889 ent = NULL;
890 node = next;
891 }
892
893 /* Do we have an entry straddling new_clusters? */
894 if (tail_ent) {
895 if (ent &&
896 ((le32_to_cpu(ent->e_rec.e_cpos) +
897 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
898 *tail_ent = ent;
899 else
900 *tail_ent = NULL;
901 }
902}
903
904static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
905{
906 struct rb_node *node;
907 struct ocfs2_extent_map_entry *ent;
908
909 while (free_head) {
910 node = free_head;
911 free_head = node->rb_right;
912
913 ent = rb_entry(node, struct ocfs2_extent_map_entry,
914 e_node);
915 kmem_cache_free(ocfs2_em_ent_cachep, ent);
916 }
917}
918
919/*
920 * Remove all entries past new_clusters, inclusive of an entry that
921 * contains new_clusters. This is effectively a cache forget.
922 *
923 * If you want to also clip the last extent by some number of clusters,
924 * you need to call ocfs2_extent_map_trunc().
925 * This code does not check or modify ip_clusters.
926 */
927int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
928{
929 struct rb_node *free_head = NULL;
930 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
931 struct ocfs2_extent_map_entry *ent;
932
933 spin_lock(&OCFS2_I(inode)->ip_lock);
934
935 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
936
937 if (ent) {
938 rb_erase(&ent->e_node, &em->em_extents);
939 ent->e_node.rb_right = free_head;
940 free_head = &ent->e_node;
941 }
942
943 spin_unlock(&OCFS2_I(inode)->ip_lock);
944
945 if (free_head)
946 __ocfs2_extent_map_drop_cleanup(free_head);
947
948 return 0;
949}
950
951/*
952 * Remove all entries past new_clusters and also clip any extent
953 * straddling new_clusters, if there is one. This does not check
954 * or modify ip_clusters
955 */
956int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
957{
958 struct rb_node *free_head = NULL;
959 struct ocfs2_extent_map_entry *ent = NULL;
960
961 spin_lock(&OCFS2_I(inode)->ip_lock);
962
963 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
964
965 if (ent)
966 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
967 le32_to_cpu(ent->e_rec.e_cpos));
968
969 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
970
971 spin_unlock(&OCFS2_I(inode)->ip_lock);
972
973 if (free_head)
974 __ocfs2_extent_map_drop_cleanup(free_head);
975
976 return 0;
977}
978
979int __init init_ocfs2_extent_maps(void)
980{
981 ocfs2_em_ent_cachep =
982 kmem_cache_create("ocfs2_em_ent",
983 sizeof(struct ocfs2_extent_map_entry),
984 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
985 if (!ocfs2_em_ent_cachep)
986 return -ENOMEM;
987
988 return 0;
989}
990
991void __exit exit_ocfs2_extent_maps(void)
992{
993 kmem_cache_destroy(ocfs2_em_ent_cachep);
994}