blob: f27e5378caf249ecccca6676a8fb2e8beda2be63 [file] [log] [blame]
Mark Fashehccd979b2005-12-15 14:31:24 -08001/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.c
5 *
6 * Extent allocs and frees
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#define MLOG_MASK_PREFIX ML_DISK_ALLOC
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "dlmglue.h"
38#include "extent_map.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "sysfile.h"
44#include "file.h"
45#include "super.h"
46#include "uptodate.h"
47
48#include "buffer_head_io.h"
49
50static int ocfs2_extent_contig(struct inode *inode,
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -070055 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -080056 struct inode *inode,
57 int wanted,
58 struct ocfs2_alloc_context *meta_ac,
59 struct buffer_head *bhs[]);
60
61static int ocfs2_add_branch(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -070062 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -080063 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -070070 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -080071 struct inode *inode,
72 struct buffer_head *fe_bh,
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -070077 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -080078 struct inode *inode,
79 struct buffer_head *fe_bh,
80 u64 blkno,
81 u32 new_clusters);
82
83static int ocfs2_find_branch_target(struct ocfs2_super *osb,
84 struct inode *inode,
85 struct buffer_head *fe_bh,
86 struct buffer_head **target_bh);
87
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
89 struct inode *inode,
90 struct ocfs2_dinode *fe,
91 unsigned int new_i_clusters,
92 struct buffer_head *old_last_eb,
93 struct buffer_head **new_last_eb);
94
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96
97static int ocfs2_extent_contig(struct inode *inode,
98 struct ocfs2_extent_rec *ext,
99 u64 blkno)
100{
101 return blkno == (le64_to_cpu(ext->e_blkno) +
102 ocfs2_clusters_to_blocks(inode->i_sb,
103 le32_to_cpu(ext->e_clusters)));
104}
105
106/*
107 * How many free extents have we got before we need more meta data?
108 */
109int ocfs2_num_free_extents(struct ocfs2_super *osb,
110 struct inode *inode,
111 struct ocfs2_dinode *fe)
112{
113 int retval;
114 struct ocfs2_extent_list *el;
115 struct ocfs2_extent_block *eb;
116 struct buffer_head *eb_bh = NULL;
117
118 mlog_entry_void();
119
120 if (!OCFS2_IS_VALID_DINODE(fe)) {
121 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
122 retval = -EIO;
123 goto bail;
124 }
125
126 if (fe->i_last_eb_blk) {
127 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
128 &eb_bh, OCFS2_BH_CACHED, inode);
129 if (retval < 0) {
130 mlog_errno(retval);
131 goto bail;
132 }
133 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
134 el = &eb->h_list;
135 } else
136 el = &fe->id2.i_list;
137
138 BUG_ON(el->l_tree_depth != 0);
139
140 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
141bail:
142 if (eb_bh)
143 brelse(eb_bh);
144
145 mlog_exit(retval);
146 return retval;
147}
148
149/* expects array to already be allocated
150 *
151 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
152 * l_count for you
153 */
154static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -0700155 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -0800156 struct inode *inode,
157 int wanted,
158 struct ocfs2_alloc_context *meta_ac,
159 struct buffer_head *bhs[])
160{
161 int count, status, i;
162 u16 suballoc_bit_start;
163 u32 num_got;
164 u64 first_blkno;
165 struct ocfs2_extent_block *eb;
166
167 mlog_entry_void();
168
169 count = 0;
170 while (count < wanted) {
171 status = ocfs2_claim_metadata(osb,
172 handle,
173 meta_ac,
174 wanted - count,
175 &suballoc_bit_start,
176 &num_got,
177 &first_blkno);
178 if (status < 0) {
179 mlog_errno(status);
180 goto bail;
181 }
182
183 for(i = count; i < (num_got + count); i++) {
184 bhs[i] = sb_getblk(osb->sb, first_blkno);
185 if (bhs[i] == NULL) {
186 status = -EIO;
187 mlog_errno(status);
188 goto bail;
189 }
190 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
191
192 status = ocfs2_journal_access(handle, inode, bhs[i],
193 OCFS2_JOURNAL_ACCESS_CREATE);
194 if (status < 0) {
195 mlog_errno(status);
196 goto bail;
197 }
198
199 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
200 eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
201 /* Ok, setup the minimal stuff here. */
202 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
203 eb->h_blkno = cpu_to_le64(first_blkno);
204 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
205
206#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
207 /* we always use slot zero's suballocator */
208 eb->h_suballoc_slot = 0;
209#else
210 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
211#endif
212 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
213 eb->h_list.l_count =
214 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
215
216 suballoc_bit_start++;
217 first_blkno++;
218
219 /* We'll also be dirtied by the caller, so
220 * this isn't absolutely necessary. */
221 status = ocfs2_journal_dirty(handle, bhs[i]);
222 if (status < 0) {
223 mlog_errno(status);
224 goto bail;
225 }
226 }
227
228 count += num_got;
229 }
230
231 status = 0;
232bail:
233 if (status < 0) {
234 for(i = 0; i < wanted; i++) {
235 if (bhs[i])
236 brelse(bhs[i]);
237 bhs[i] = NULL;
238 }
239 }
240 mlog_exit(status);
241 return status;
242}
243
244/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode
247 * structure.
248 *
249 * last_eb_bh is required as we have to update it's next_leaf pointer
250 * for the new last extent block.
251 *
252 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0.
254 */
255static int ocfs2_add_branch(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -0700256 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -0800257 struct inode *inode,
258 struct buffer_head *fe_bh,
259 struct buffer_head *eb_bh,
260 struct buffer_head *last_eb_bh,
261 struct ocfs2_alloc_context *meta_ac)
262{
263 int status, new_blocks, i;
264 u64 next_blkno, new_last_eb_blk;
265 struct buffer_head *bh;
266 struct buffer_head **new_eb_bhs = NULL;
267 struct ocfs2_dinode *fe;
268 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el;
271
272 mlog_entry_void();
273
274 BUG_ON(!last_eb_bh);
275
276 fe = (struct ocfs2_dinode *) fe_bh->b_data;
277
278 if (eb_bh) {
279 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
280 el = &eb->h_list;
281 } else
282 el = &fe->id2.i_list;
283
284 /* we never add a branch to a leaf. */
285 BUG_ON(!el->l_tree_depth);
286
287 new_blocks = le16_to_cpu(el->l_tree_depth);
288
289 /* allocate the number of new eb blocks we need */
290 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
291 GFP_KERNEL);
292 if (!new_eb_bhs) {
293 status = -ENOMEM;
294 mlog_errno(status);
295 goto bail;
296 }
297
298 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
299 meta_ac, new_eb_bhs);
300 if (status < 0) {
301 mlog_errno(status);
302 goto bail;
303 }
304
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree.
307 * conversly, new_eb_bhs[0] is the new bottommost leaf.
308 *
309 * when we leave the loop, new_last_eb_blk will point to the
310 * newest leaf, and next_blkno will point to the topmost extent
311 * block. */
312 next_blkno = new_last_eb_blk = 0;
313 for(i = 0; i < new_blocks; i++) {
314 bh = new_eb_bhs[i];
315 eb = (struct ocfs2_extent_block *) bh->b_data;
316 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
317 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
318 status = -EIO;
319 goto bail;
320 }
321 eb_el = &eb->h_list;
322
323 status = ocfs2_journal_access(handle, inode, bh,
324 OCFS2_JOURNAL_ACCESS_CREATE);
325 if (status < 0) {
326 mlog_errno(status);
327 goto bail;
328 }
329
330 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters;
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338
339 status = ocfs2_journal_dirty(handle, bh);
340 if (status < 0) {
341 mlog_errno(status);
342 goto bail;
343 }
344
345 next_blkno = le64_to_cpu(eb->h_blkno);
346 }
347
348 /* This is a bit hairy. We want to update up to three blocks
349 * here without leaving any of them in an inconsistent state
350 * in case of error. We don't have to worry about
351 * journal_dirty erroring as it won't unless we've aborted the
352 * handle (in which case we would never be here) so reserving
353 * the write with journal_access is all we need to do. */
354 status = ocfs2_journal_access(handle, inode, last_eb_bh,
355 OCFS2_JOURNAL_ACCESS_WRITE);
356 if (status < 0) {
357 mlog_errno(status);
358 goto bail;
359 }
360 status = ocfs2_journal_access(handle, inode, fe_bh,
361 OCFS2_JOURNAL_ACCESS_WRITE);
362 if (status < 0) {
363 mlog_errno(status);
364 goto bail;
365 }
366 if (eb_bh) {
367 status = ocfs2_journal_access(handle, inode, eb_bh,
368 OCFS2_JOURNAL_ACCESS_WRITE);
369 if (status < 0) {
370 mlog_errno(status);
371 goto bail;
372 }
373 }
374
375 /* Link the new branch into the rest of the tree (el will
376 * either be on the fe, or the extent block passed in. */
377 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters;
380 el->l_recs[i].e_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1);
382
383 /* fe needs a new last extent block pointer, as does the
384 * next_leaf on the previously last-extent-block. */
385 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
386
387 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
388 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
389
390 status = ocfs2_journal_dirty(handle, last_eb_bh);
391 if (status < 0)
392 mlog_errno(status);
393 status = ocfs2_journal_dirty(handle, fe_bh);
394 if (status < 0)
395 mlog_errno(status);
396 if (eb_bh) {
397 status = ocfs2_journal_dirty(handle, eb_bh);
398 if (status < 0)
399 mlog_errno(status);
400 }
401
402 status = 0;
403bail:
404 if (new_eb_bhs) {
405 for (i = 0; i < new_blocks; i++)
406 if (new_eb_bhs[i])
407 brelse(new_eb_bhs[i]);
408 kfree(new_eb_bhs);
409 }
410
411 mlog_exit(status);
412 return status;
413}
414
415/*
416 * adds another level to the allocation tree.
417 * returns back the new extent block so you can add a branch to it
418 * after this call.
419 */
420static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -0700421 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -0800422 struct inode *inode,
423 struct buffer_head *fe_bh,
424 struct ocfs2_alloc_context *meta_ac,
425 struct buffer_head **ret_new_eb_bh)
426{
427 int status, i;
428 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb;
431 struct ocfs2_extent_list *fe_el;
432 struct ocfs2_extent_list *eb_el;
433
434 mlog_entry_void();
435
436 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
437 &new_eb_bh);
438 if (status < 0) {
439 mlog_errno(status);
440 goto bail;
441 }
442
443 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
444 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
445 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
446 status = -EIO;
447 goto bail;
448 }
449
450 eb_el = &eb->h_list;
451 fe = (struct ocfs2_dinode *) fe_bh->b_data;
452 fe_el = &fe->id2.i_list;
453
454 status = ocfs2_journal_access(handle, inode, new_eb_bh,
455 OCFS2_JOURNAL_ACCESS_CREATE);
456 if (status < 0) {
457 mlog_errno(status);
458 goto bail;
459 }
460
461 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 }
469
470 status = ocfs2_journal_dirty(handle, new_eb_bh);
471 if (status < 0) {
472 mlog_errno(status);
473 goto bail;
474 }
475
476 status = ocfs2_journal_access(handle, inode, fe_bh,
477 OCFS2_JOURNAL_ACCESS_WRITE);
478 if (status < 0) {
479 mlog_errno(status);
480 goto bail;
481 }
482
483 /* update fe now */
484 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters;
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 fe_el->l_recs[i].e_cpos = 0;
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
492 }
493 fe_el->l_next_free_rec = cpu_to_le16(1);
494
495 /* If this is our 1st tree depth shift, then last_eb_blk
496 * becomes the allocated extent block */
497 if (fe_el->l_tree_depth == cpu_to_le16(1))
498 fe->i_last_eb_blk = eb->h_blkno;
499
500 status = ocfs2_journal_dirty(handle, fe_bh);
501 if (status < 0) {
502 mlog_errno(status);
503 goto bail;
504 }
505
506 *ret_new_eb_bh = new_eb_bh;
507 new_eb_bh = NULL;
508 status = 0;
509bail:
510 if (new_eb_bh)
511 brelse(new_eb_bh);
512
513 mlog_exit(status);
514 return status;
515}
516
517/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -0700523 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -0800524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
Mark Fashehb06970532006-03-03 10:24:33 -0800569 "Dinode %llu has a bad extent list",
570 (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -0800571 status = -EIO;
572 goto bail;
573 }
574 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
575
576 BUG_ON(i >= num_bhs);
577 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578 OCFS2_BH_CACHED, inode);
579 if (status < 0) {
580 mlog_errno(status);
581 goto bail;
582 }
583 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
586 eb);
587 status = -EIO;
588 goto bail;
589 }
590
591 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) {
594 mlog_errno(status);
595 goto bail;
596 }
597
598 el = &eb->h_list;
599 i++;
600 /* When we leave this loop, eb_bhs[num_bhs - 1] will
601 * hold the bottom-most leaf extent block. */
602 }
603 BUG_ON(el->l_tree_depth);
604
605 el = &fe->id2.i_list;
606 /* If we have tree depth, then the fe update is
607 * trivial, and we want to switch el out for the
608 * bottom-most leaf in order to update it with the
609 * actual extent data below. */
610 next_free = le16_to_cpu(el->l_next_free_rec);
611 if (next_free == 0) {
612 ocfs2_error(inode->i_sb,
Mark Fashehb06970532006-03-03 10:24:33 -0800613 "Dinode %llu has a bad extent list",
614 (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -0800615 status = -EIO;
616 goto bail;
617 }
618 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
619 new_clusters);
620 /* (num_bhs - 1) to avoid the leaf */
621 for(i = 0; i < (num_bhs - 1); i++) {
622 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
623 el = &eb->h_list;
624
625 /* finally, make our actual change to the
626 * intermediate extent blocks. */
627 next_free = le16_to_cpu(el->l_next_free_rec);
628 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
629 new_clusters);
630
631 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
632 if (status < 0)
633 mlog_errno(status);
634 }
635 BUG_ON(i != (num_bhs - 1));
636 /* note that the leaf block wasn't touched in
637 * the loop above */
638 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
639 el = &eb->h_list;
640 BUG_ON(el->l_tree_depth);
641 }
642
643 /* yay, we can finally add the actual extent now! */
644 i = le16_to_cpu(el->l_next_free_rec) - 1;
645 if (le16_to_cpu(el->l_next_free_rec) &&
646 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648 } else if (le16_to_cpu(el->l_next_free_rec) &&
649 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650 /* having an empty extent at eof is legal. */
651 if (el->l_recs[i].e_cpos != fe->i_clusters) {
652 ocfs2_error(inode->i_sb,
Mark Fashehb06970532006-03-03 10:24:33 -0800653 "Dinode %llu trailing extent is bad: "
Mark Fashehccd979b2005-12-15 14:31:24 -0800654 "cpos (%u) != number of clusters (%u)",
Mark Fashehb06970532006-03-03 10:24:33 -0800655 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -0800656 le32_to_cpu(el->l_recs[i].e_cpos),
657 le32_to_cpu(fe->i_clusters));
658 status = -EIO;
659 goto bail;
660 }
661 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
663 } else {
664 /* No contiguous record, or no empty record at eof, so
665 * we add a new one. */
666
667 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668 le16_to_cpu(el->l_count));
669 i = le16_to_cpu(el->l_next_free_rec);
670
671 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673 el->l_recs[i].e_cpos = fe->i_clusters;
674 le16_add_cpu(&el->l_next_free_rec, 1);
675 }
676
677 /*
678 * extent_map errors are not fatal, so they are ignored outside
679 * of flushing the thing.
680 */
681 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
682 new_clusters);
683 if (status) {
684 mlog_errno(status);
685 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
686 }
687
688 status = ocfs2_journal_dirty(handle, fe_bh);
689 if (status < 0)
690 mlog_errno(status);
691 if (fe->id2.i_list.l_tree_depth) {
692 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
693 if (status < 0)
694 mlog_errno(status);
695 }
696
697 status = 0;
698bail:
699 if (eb_bhs) {
700 for (i = 0; i < num_bhs; i++)
701 if (eb_bhs[i])
702 brelse(eb_bhs[i]);
703 kfree(eb_bhs);
704 }
705
706 mlog_exit(status);
707 return status;
708}
709
710/*
711 * Should only be called when there is no space left in any of the
712 * leaf nodes. What we want to do is find the lowest tree depth
713 * non-leaf extent block with room for new records. There are three
714 * valid results of this search:
715 *
716 * 1) a lowest extent block is found, then we pass it back in
717 * *lowest_eb_bh and return '0'
718 *
719 * 2) the search fails to find anything, but the dinode has room. We
720 * pass NULL back in *lowest_eb_bh, but still return '0'
721 *
722 * 3) the search fails to find anything AND the dinode is full, in
723 * which case we return > 0
724 *
725 * return status < 0 indicates an error.
726 */
727static int ocfs2_find_branch_target(struct ocfs2_super *osb,
728 struct inode *inode,
729 struct buffer_head *fe_bh,
730 struct buffer_head **target_bh)
731{
732 int status = 0, i;
733 u64 blkno;
734 struct ocfs2_dinode *fe;
735 struct ocfs2_extent_block *eb;
736 struct ocfs2_extent_list *el;
737 struct buffer_head *bh = NULL;
738 struct buffer_head *lowest_bh = NULL;
739
740 mlog_entry_void();
741
742 *target_bh = NULL;
743
744 fe = (struct ocfs2_dinode *) fe_bh->b_data;
745 el = &fe->id2.i_list;
746
747 while(le16_to_cpu(el->l_tree_depth) > 1) {
748 if (le16_to_cpu(el->l_next_free_rec) == 0) {
Mark Fashehb06970532006-03-03 10:24:33 -0800749 ocfs2_error(inode->i_sb, "Dinode %llu has empty "
Mark Fashehccd979b2005-12-15 14:31:24 -0800750 "extent list (next_free_rec == 0)",
Mark Fashehb06970532006-03-03 10:24:33 -0800751 (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -0800752 status = -EIO;
753 goto bail;
754 }
755 i = le16_to_cpu(el->l_next_free_rec) - 1;
756 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
757 if (!blkno) {
Mark Fashehb06970532006-03-03 10:24:33 -0800758 ocfs2_error(inode->i_sb, "Dinode %llu has extent "
Mark Fashehccd979b2005-12-15 14:31:24 -0800759 "list where extent # %d has no physical "
760 "block start",
Mark Fashehb06970532006-03-03 10:24:33 -0800761 (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
Mark Fashehccd979b2005-12-15 14:31:24 -0800762 status = -EIO;
763 goto bail;
764 }
765
766 if (bh) {
767 brelse(bh);
768 bh = NULL;
769 }
770
771 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
772 inode);
773 if (status < 0) {
774 mlog_errno(status);
775 goto bail;
776 }
777
778 eb = (struct ocfs2_extent_block *) bh->b_data;
779 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
780 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
781 status = -EIO;
782 goto bail;
783 }
784 el = &eb->h_list;
785
786 if (le16_to_cpu(el->l_next_free_rec) <
787 le16_to_cpu(el->l_count)) {
788 if (lowest_bh)
789 brelse(lowest_bh);
790 lowest_bh = bh;
791 get_bh(lowest_bh);
792 }
793 }
794
795 /* If we didn't find one and the fe doesn't have any room,
796 * then return '1' */
797 if (!lowest_bh
798 && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
799 status = 1;
800
801 *target_bh = lowest_bh;
802bail:
803 if (bh)
804 brelse(bh);
805
806 mlog_exit(status);
807 return status;
808}
809
810/* the caller needs to update fe->i_clusters */
811int ocfs2_insert_extent(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -0700812 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -0800813 struct inode *inode,
814 struct buffer_head *fe_bh,
815 u64 start_blk,
816 u32 new_clusters,
817 struct ocfs2_alloc_context *meta_ac)
818{
819 int status, i, shift;
820 struct buffer_head *last_eb_bh = NULL;
821 struct buffer_head *bh = NULL;
822 struct ocfs2_dinode *fe;
823 struct ocfs2_extent_block *eb;
824 struct ocfs2_extent_list *el;
825
826 mlog_entry_void();
827
Mark Fashehb06970532006-03-03 10:24:33 -0800828 mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
829 new_clusters, (unsigned long long)start_blk,
830 (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -0800831
832 fe = (struct ocfs2_dinode *) fe_bh->b_data;
833 el = &fe->id2.i_list;
834
835 if (el->l_tree_depth) {
836 /* jump to end of tree */
837 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
838 &last_eb_bh, OCFS2_BH_CACHED, inode);
839 if (status < 0) {
840 mlog_exit(status);
841 goto bail;
842 }
843 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
844 el = &eb->h_list;
845 }
846
847 /* Can we allocate without adding/shifting tree bits? */
848 i = le16_to_cpu(el->l_next_free_rec) - 1;
849 if (le16_to_cpu(el->l_next_free_rec) == 0
850 || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
851 || le32_to_cpu(el->l_recs[i].e_clusters) == 0
852 || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
853 goto out_add;
854
855 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
856 "tree now.\n");
857
858 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
859 if (shift < 0) {
860 status = shift;
861 mlog_errno(status);
862 goto bail;
863 }
864
865 /* We traveled all the way to the bottom of the allocation tree
866 * and didn't find room for any more extents - we need to add
867 * another tree level */
868 if (shift) {
869 /* if we hit a leaf, we'd better be empty :) */
870 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
871 le16_to_cpu(el->l_count));
872 BUG_ON(bh);
873 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
874 "(current = %u)\n",
875 le16_to_cpu(fe->id2.i_list.l_tree_depth));
876
877 /* ocfs2_shift_tree_depth will return us a buffer with
878 * the new extent block (so we can pass that to
879 * ocfs2_add_branch). */
880 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
881 meta_ac, &bh);
882 if (status < 0) {
883 mlog_errno(status);
884 goto bail;
885 }
886 /* Special case: we have room now if we shifted from
887 * tree_depth 0 */
888 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
889 goto out_add;
890 }
891
892 /* call ocfs2_add_branch to add the final part of the tree with
893 * the new data. */
894 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
895 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
896 meta_ac);
897 if (status < 0) {
898 mlog_errno(status);
899 goto bail;
900 }
901
902out_add:
903 /* Finally, we can add clusters. */
904 status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
905 start_blk, new_clusters);
906 if (status < 0)
907 mlog_errno(status);
908
909bail:
910 if (bh)
911 brelse(bh);
912
913 if (last_eb_bh)
914 brelse(last_eb_bh);
915
916 mlog_exit(status);
917 return status;
918}
919
920static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
921{
922 struct buffer_head *tl_bh = osb->osb_tl_bh;
923 struct ocfs2_dinode *di;
924 struct ocfs2_truncate_log *tl;
925
926 di = (struct ocfs2_dinode *) tl_bh->b_data;
927 tl = &di->id2.i_dealloc;
928
929 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
930 "slot %d, invalid truncate log parameters: used = "
931 "%u, count = %u\n", osb->slot_num,
932 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
933 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
934}
935
936static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
937 unsigned int new_start)
938{
939 unsigned int tail_index;
940 unsigned int current_tail;
941
942 /* No records, nothing to coalesce */
943 if (!le16_to_cpu(tl->tl_used))
944 return 0;
945
946 tail_index = le16_to_cpu(tl->tl_used) - 1;
947 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
948 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
949
950 return current_tail == new_start;
951}
952
953static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -0700954 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -0800955 u64 start_blk,
956 unsigned int num_clusters)
957{
958 int status, index;
959 unsigned int start_cluster, tl_count;
960 struct inode *tl_inode = osb->osb_tl_inode;
961 struct buffer_head *tl_bh = osb->osb_tl_bh;
962 struct ocfs2_dinode *di;
963 struct ocfs2_truncate_log *tl;
964
Mark Fashehb06970532006-03-03 10:24:33 -0800965 mlog_entry("start_blk = %llu, num_clusters = %u\n",
966 (unsigned long long)start_blk, num_clusters);
Mark Fashehccd979b2005-12-15 14:31:24 -0800967
Jes Sorensen1b1dcc12006-01-09 15:59:24 -0800968 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
Mark Fashehccd979b2005-12-15 14:31:24 -0800969
970 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
971
972 di = (struct ocfs2_dinode *) tl_bh->b_data;
973 tl = &di->id2.i_dealloc;
974 if (!OCFS2_IS_VALID_DINODE(di)) {
975 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
976 status = -EIO;
977 goto bail;
978 }
979
980 tl_count = le16_to_cpu(tl->tl_count);
981 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
982 tl_count == 0,
Mark Fashehb06970532006-03-03 10:24:33 -0800983 "Truncate record count on #%llu invalid "
984 "wanted %u, actual %u\n",
985 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -0800986 ocfs2_truncate_recs_per_inode(osb->sb),
987 le16_to_cpu(tl->tl_count));
988
989 /* Caller should have known to flush before calling us. */
990 index = le16_to_cpu(tl->tl_used);
991 if (index >= tl_count) {
992 status = -ENOSPC;
993 mlog_errno(status);
994 goto bail;
995 }
996
997 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
998 OCFS2_JOURNAL_ACCESS_WRITE);
999 if (status < 0) {
1000 mlog_errno(status);
1001 goto bail;
1002 }
1003
1004 mlog(0, "Log truncate of %u clusters starting at cluster %u to "
Mark Fashehb06970532006-03-03 10:24:33 -08001005 "%llu (index = %d)\n", num_clusters, start_cluster,
1006 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
Mark Fashehccd979b2005-12-15 14:31:24 -08001007
1008 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
1009 /*
1010 * Move index back to the record we are coalescing with.
1011 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
1012 */
1013 index--;
1014
1015 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
1016 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
1017 index, le32_to_cpu(tl->tl_recs[index].t_start),
1018 num_clusters);
1019 } else {
1020 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
1021 tl->tl_used = cpu_to_le16(index + 1);
1022 }
1023 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
1024
1025 status = ocfs2_journal_dirty(handle, tl_bh);
1026 if (status < 0) {
1027 mlog_errno(status);
1028 goto bail;
1029 }
1030
1031bail:
1032 mlog_exit(status);
1033 return status;
1034}
1035
1036static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
Mark Fasheh1fabe142006-10-09 18:11:45 -07001037 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -08001038 struct inode *data_alloc_inode,
1039 struct buffer_head *data_alloc_bh)
1040{
1041 int status = 0;
1042 int i;
1043 unsigned int num_clusters;
1044 u64 start_blk;
1045 struct ocfs2_truncate_rec rec;
1046 struct ocfs2_dinode *di;
1047 struct ocfs2_truncate_log *tl;
1048 struct inode *tl_inode = osb->osb_tl_inode;
1049 struct buffer_head *tl_bh = osb->osb_tl_bh;
1050
1051 mlog_entry_void();
1052
1053 di = (struct ocfs2_dinode *) tl_bh->b_data;
1054 tl = &di->id2.i_dealloc;
1055 i = le16_to_cpu(tl->tl_used) - 1;
1056 while (i >= 0) {
1057 /* Caller has given us at least enough credits to
1058 * update the truncate log dinode */
1059 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
1060 OCFS2_JOURNAL_ACCESS_WRITE);
1061 if (status < 0) {
1062 mlog_errno(status);
1063 goto bail;
1064 }
1065
1066 tl->tl_used = cpu_to_le16(i);
1067
1068 status = ocfs2_journal_dirty(handle, tl_bh);
1069 if (status < 0) {
1070 mlog_errno(status);
1071 goto bail;
1072 }
1073
1074 /* TODO: Perhaps we can calculate the bulk of the
1075 * credits up front rather than extending like
1076 * this. */
1077 status = ocfs2_extend_trans(handle,
1078 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 rec = tl->tl_recs[i];
1085 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
1086 le32_to_cpu(rec.t_start));
1087 num_clusters = le32_to_cpu(rec.t_clusters);
1088
1089 /* if start_blk is not set, we ignore the record as
1090 * invalid. */
1091 if (start_blk) {
1092 mlog(0, "free record %d, start = %u, clusters = %u\n",
1093 i, le32_to_cpu(rec.t_start), num_clusters);
1094
1095 status = ocfs2_free_clusters(handle, data_alloc_inode,
1096 data_alloc_bh, start_blk,
1097 num_clusters);
1098 if (status < 0) {
1099 mlog_errno(status);
1100 goto bail;
1101 }
1102 }
1103 i--;
1104 }
1105
1106bail:
1107 mlog_exit(status);
1108 return status;
1109}
1110
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001111/* Expects you to already be holding tl_inode->i_mutex */
Mark Fashehccd979b2005-12-15 14:31:24 -08001112static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1113{
1114 int status;
1115 unsigned int num_to_flush;
Mark Fasheh1fabe142006-10-09 18:11:45 -07001116 handle_t *handle;
Mark Fashehccd979b2005-12-15 14:31:24 -08001117 struct inode *tl_inode = osb->osb_tl_inode;
1118 struct inode *data_alloc_inode = NULL;
1119 struct buffer_head *tl_bh = osb->osb_tl_bh;
1120 struct buffer_head *data_alloc_bh = NULL;
1121 struct ocfs2_dinode *di;
1122 struct ocfs2_truncate_log *tl;
1123
1124 mlog_entry_void();
1125
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001126 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
Mark Fashehccd979b2005-12-15 14:31:24 -08001127
1128 di = (struct ocfs2_dinode *) tl_bh->b_data;
1129 tl = &di->id2.i_dealloc;
1130 if (!OCFS2_IS_VALID_DINODE(di)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
1132 status = -EIO;
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001133 goto out;
Mark Fashehccd979b2005-12-15 14:31:24 -08001134 }
1135
1136 num_to_flush = le16_to_cpu(tl->tl_used);
Mark Fashehb06970532006-03-03 10:24:33 -08001137 mlog(0, "Flush %u records from truncate log #%llu\n",
1138 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001139 if (!num_to_flush) {
1140 status = 0;
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001141 goto out;
Mark Fashehccd979b2005-12-15 14:31:24 -08001142 }
1143
1144 data_alloc_inode = ocfs2_get_system_file_inode(osb,
1145 GLOBAL_BITMAP_SYSTEM_INODE,
1146 OCFS2_INVALID_SLOT);
1147 if (!data_alloc_inode) {
1148 status = -EINVAL;
1149 mlog(ML_ERROR, "Could not get bitmap inode!\n");
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001150 goto out;
Mark Fashehccd979b2005-12-15 14:31:24 -08001151 }
1152
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001153 mutex_lock(&data_alloc_inode->i_mutex);
1154
Mark Fasheh4bcec182006-10-09 16:02:40 -07001155 status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
Mark Fashehccd979b2005-12-15 14:31:24 -08001156 if (status < 0) {
1157 mlog_errno(status);
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001158 goto out_mutex;
Mark Fashehccd979b2005-12-15 14:31:24 -08001159 }
1160
Mark Fasheh65eff9c2006-10-09 17:26:22 -07001161 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
Mark Fashehccd979b2005-12-15 14:31:24 -08001162 if (IS_ERR(handle)) {
1163 status = PTR_ERR(handle);
Mark Fashehccd979b2005-12-15 14:31:24 -08001164 mlog_errno(status);
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001165 goto out_unlock;
Mark Fashehccd979b2005-12-15 14:31:24 -08001166 }
1167
1168 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
1169 data_alloc_bh);
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001170 if (status < 0)
Mark Fashehccd979b2005-12-15 14:31:24 -08001171 mlog_errno(status);
Mark Fashehccd979b2005-12-15 14:31:24 -08001172
Mark Fasheh02dc1af2006-10-09 16:48:10 -07001173 ocfs2_commit_trans(osb, handle);
Mark Fashehccd979b2005-12-15 14:31:24 -08001174
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001175out_unlock:
1176 brelse(data_alloc_bh);
1177 ocfs2_meta_unlock(data_alloc_inode, 1);
Mark Fashehccd979b2005-12-15 14:31:24 -08001178
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001179out_mutex:
1180 mutex_unlock(&data_alloc_inode->i_mutex);
1181 iput(data_alloc_inode);
Mark Fashehccd979b2005-12-15 14:31:24 -08001182
Mark Fashehe08dc8b2006-10-05 15:58:48 -07001183out:
Mark Fashehccd979b2005-12-15 14:31:24 -08001184 mlog_exit(status);
1185 return status;
1186}
1187
1188int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1189{
1190 int status;
1191 struct inode *tl_inode = osb->osb_tl_inode;
1192
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001193 mutex_lock(&tl_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08001194 status = __ocfs2_flush_truncate_log(osb);
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001195 mutex_unlock(&tl_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08001196
1197 return status;
1198}
1199
David Howellsc4028952006-11-22 14:57:56 +00001200static void ocfs2_truncate_log_worker(struct work_struct *work)
Mark Fashehccd979b2005-12-15 14:31:24 -08001201{
1202 int status;
David Howellsc4028952006-11-22 14:57:56 +00001203 struct ocfs2_super *osb =
1204 container_of(work, struct ocfs2_super,
1205 osb_truncate_log_wq.work);
Mark Fashehccd979b2005-12-15 14:31:24 -08001206
1207 mlog_entry_void();
1208
1209 status = ocfs2_flush_truncate_log(osb);
1210 if (status < 0)
1211 mlog_errno(status);
1212
1213 mlog_exit(status);
1214}
1215
1216#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
1217void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
1218 int cancel)
1219{
1220 if (osb->osb_tl_inode) {
1221 /* We want to push off log flushes while truncates are
1222 * still running. */
1223 if (cancel)
1224 cancel_delayed_work(&osb->osb_truncate_log_wq);
1225
1226 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
1227 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
1228 }
1229}
1230
1231static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
1232 int slot_num,
1233 struct inode **tl_inode,
1234 struct buffer_head **tl_bh)
1235{
1236 int status;
1237 struct inode *inode = NULL;
1238 struct buffer_head *bh = NULL;
1239
1240 inode = ocfs2_get_system_file_inode(osb,
1241 TRUNCATE_LOG_SYSTEM_INODE,
1242 slot_num);
1243 if (!inode) {
1244 status = -EINVAL;
1245 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
1246 goto bail;
1247 }
1248
1249 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
1250 OCFS2_BH_CACHED, inode);
1251 if (status < 0) {
1252 iput(inode);
1253 mlog_errno(status);
1254 goto bail;
1255 }
1256
1257 *tl_inode = inode;
1258 *tl_bh = bh;
1259bail:
1260 mlog_exit(status);
1261 return status;
1262}
1263
1264/* called during the 1st stage of node recovery. we stamp a clean
1265 * truncate log and pass back a copy for processing later. if the
1266 * truncate log does not require processing, a *tl_copy is set to
1267 * NULL. */
1268int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
1269 int slot_num,
1270 struct ocfs2_dinode **tl_copy)
1271{
1272 int status;
1273 struct inode *tl_inode = NULL;
1274 struct buffer_head *tl_bh = NULL;
1275 struct ocfs2_dinode *di;
1276 struct ocfs2_truncate_log *tl;
1277
1278 *tl_copy = NULL;
1279
1280 mlog(0, "recover truncate log from slot %d\n", slot_num);
1281
1282 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
1283 if (status < 0) {
1284 mlog_errno(status);
1285 goto bail;
1286 }
1287
1288 di = (struct ocfs2_dinode *) tl_bh->b_data;
1289 tl = &di->id2.i_dealloc;
1290 if (!OCFS2_IS_VALID_DINODE(di)) {
1291 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
1292 status = -EIO;
1293 goto bail;
1294 }
1295
1296 if (le16_to_cpu(tl->tl_used)) {
1297 mlog(0, "We'll have %u logs to recover\n",
1298 le16_to_cpu(tl->tl_used));
1299
1300 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
1301 if (!(*tl_copy)) {
1302 status = -ENOMEM;
1303 mlog_errno(status);
1304 goto bail;
1305 }
1306
1307 /* Assuming the write-out below goes well, this copy
1308 * will be passed back to recovery for processing. */
1309 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
1310
1311 /* All we need to do to clear the truncate log is set
1312 * tl_used. */
1313 tl->tl_used = 0;
1314
1315 status = ocfs2_write_block(osb, tl_bh, tl_inode);
1316 if (status < 0) {
1317 mlog_errno(status);
1318 goto bail;
1319 }
1320 }
1321
1322bail:
1323 if (tl_inode)
1324 iput(tl_inode);
1325 if (tl_bh)
1326 brelse(tl_bh);
1327
1328 if (status < 0 && (*tl_copy)) {
1329 kfree(*tl_copy);
1330 *tl_copy = NULL;
1331 }
1332
1333 mlog_exit(status);
1334 return status;
1335}
1336
1337int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1338 struct ocfs2_dinode *tl_copy)
1339{
1340 int status = 0;
1341 int i;
1342 unsigned int clusters, num_recs, start_cluster;
1343 u64 start_blk;
Mark Fasheh1fabe142006-10-09 18:11:45 -07001344 handle_t *handle;
Mark Fashehccd979b2005-12-15 14:31:24 -08001345 struct inode *tl_inode = osb->osb_tl_inode;
1346 struct ocfs2_truncate_log *tl;
1347
1348 mlog_entry_void();
1349
1350 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
1351 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
1352 return -EINVAL;
1353 }
1354
1355 tl = &tl_copy->id2.i_dealloc;
1356 num_recs = le16_to_cpu(tl->tl_used);
Mark Fashehb06970532006-03-03 10:24:33 -08001357 mlog(0, "cleanup %u records from %llu\n", num_recs,
1358 (unsigned long long)tl_copy->i_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001359
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001360 mutex_lock(&tl_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08001361 for(i = 0; i < num_recs; i++) {
1362 if (ocfs2_truncate_log_needs_flush(osb)) {
1363 status = __ocfs2_flush_truncate_log(osb);
1364 if (status < 0) {
1365 mlog_errno(status);
1366 goto bail_up;
1367 }
1368 }
1369
Mark Fasheh65eff9c2006-10-09 17:26:22 -07001370 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
Mark Fashehccd979b2005-12-15 14:31:24 -08001371 if (IS_ERR(handle)) {
1372 status = PTR_ERR(handle);
1373 mlog_errno(status);
1374 goto bail_up;
1375 }
1376
1377 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
1378 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
1379 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
1380
1381 status = ocfs2_truncate_log_append(osb, handle,
1382 start_blk, clusters);
Mark Fasheh02dc1af2006-10-09 16:48:10 -07001383 ocfs2_commit_trans(osb, handle);
Mark Fashehccd979b2005-12-15 14:31:24 -08001384 if (status < 0) {
1385 mlog_errno(status);
1386 goto bail_up;
1387 }
1388 }
1389
1390bail_up:
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001391 mutex_unlock(&tl_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08001392
1393 mlog_exit(status);
1394 return status;
1395}
1396
1397void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
1398{
1399 int status;
1400 struct inode *tl_inode = osb->osb_tl_inode;
1401
1402 mlog_entry_void();
1403
1404 if (tl_inode) {
1405 cancel_delayed_work(&osb->osb_truncate_log_wq);
1406 flush_workqueue(ocfs2_wq);
1407
1408 status = ocfs2_flush_truncate_log(osb);
1409 if (status < 0)
1410 mlog_errno(status);
1411
1412 brelse(osb->osb_tl_bh);
1413 iput(osb->osb_tl_inode);
1414 }
1415
1416 mlog_exit_void();
1417}
1418
1419int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1420{
1421 int status;
1422 struct inode *tl_inode = NULL;
1423 struct buffer_head *tl_bh = NULL;
1424
1425 mlog_entry_void();
1426
1427 status = ocfs2_get_truncate_log_info(osb,
1428 osb->slot_num,
1429 &tl_inode,
1430 &tl_bh);
1431 if (status < 0)
1432 mlog_errno(status);
1433
1434 /* ocfs2_truncate_log_shutdown keys on the existence of
1435 * osb->osb_tl_inode so we don't set any of the osb variables
1436 * until we're sure all is well. */
David Howellsc4028952006-11-22 14:57:56 +00001437 INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
1438 ocfs2_truncate_log_worker);
Mark Fashehccd979b2005-12-15 14:31:24 -08001439 osb->osb_tl_bh = tl_bh;
1440 osb->osb_tl_inode = tl_inode;
1441
1442 mlog_exit(status);
1443 return status;
1444}
1445
1446/* This function will figure out whether the currently last extent
1447 * block will be deleted, and if it will, what the new last extent
1448 * block will be so we can update his h_next_leaf_blk field, as well
1449 * as the dinodes i_last_eb_blk */
1450static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
1451 struct inode *inode,
1452 struct ocfs2_dinode *fe,
1453 u32 new_i_clusters,
1454 struct buffer_head *old_last_eb,
1455 struct buffer_head **new_last_eb)
1456{
1457 int i, status = 0;
1458 u64 block = 0;
1459 struct ocfs2_extent_block *eb;
1460 struct ocfs2_extent_list *el;
1461 struct buffer_head *bh = NULL;
1462
1463 *new_last_eb = NULL;
1464
1465 if (!OCFS2_IS_VALID_DINODE(fe)) {
1466 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1467 status = -EIO;
1468 goto bail;
1469 }
1470
1471 /* we have no tree, so of course, no last_eb. */
1472 if (!fe->id2.i_list.l_tree_depth)
1473 goto bail;
1474
1475 /* trunc to zero special case - this makes tree_depth = 0
1476 * regardless of what it is. */
1477 if (!new_i_clusters)
1478 goto bail;
1479
1480 eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
1481 el = &(eb->h_list);
1482 BUG_ON(!el->l_next_free_rec);
1483
1484 /* Make sure that this guy will actually be empty after we
1485 * clear away the data. */
1486 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1487 goto bail;
1488
1489 /* Ok, at this point, we know that last_eb will definitely
1490 * change, so lets traverse the tree and find the second to
1491 * last extent block. */
1492 el = &(fe->id2.i_list);
1493 /* go down the tree, */
1494 do {
1495 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1496 if (le32_to_cpu(el->l_recs[i].e_cpos) <
1497 new_i_clusters) {
1498 block = le64_to_cpu(el->l_recs[i].e_blkno);
1499 break;
1500 }
1501 }
1502 BUG_ON(i < 0);
1503
1504 if (bh) {
1505 brelse(bh);
1506 bh = NULL;
1507 }
1508
1509 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
1510 inode);
1511 if (status < 0) {
1512 mlog_errno(status);
1513 goto bail;
1514 }
1515 eb = (struct ocfs2_extent_block *) bh->b_data;
1516 el = &eb->h_list;
1517 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1518 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1519 status = -EIO;
1520 goto bail;
1521 }
1522 } while (el->l_tree_depth);
1523
1524 *new_last_eb = bh;
1525 get_bh(*new_last_eb);
Mark Fashehb06970532006-03-03 10:24:33 -08001526 mlog(0, "returning block %llu\n",
1527 (unsigned long long)le64_to_cpu(eb->h_blkno));
Mark Fashehccd979b2005-12-15 14:31:24 -08001528bail:
1529 if (bh)
1530 brelse(bh);
1531
1532 return status;
1533}
1534
1535static int ocfs2_do_truncate(struct ocfs2_super *osb,
1536 unsigned int clusters_to_del,
1537 struct inode *inode,
1538 struct buffer_head *fe_bh,
1539 struct buffer_head *old_last_eb_bh,
Mark Fasheh1fabe142006-10-09 18:11:45 -07001540 handle_t *handle,
Mark Fashehccd979b2005-12-15 14:31:24 -08001541 struct ocfs2_truncate_context *tc)
1542{
1543 int status, i, depth;
1544 struct ocfs2_dinode *fe;
1545 struct ocfs2_extent_block *eb;
1546 struct ocfs2_extent_block *last_eb = NULL;
1547 struct ocfs2_extent_list *el;
1548 struct buffer_head *eb_bh = NULL;
1549 struct buffer_head *last_eb_bh = NULL;
1550 u64 next_eb = 0;
1551 u64 delete_blk = 0;
1552
1553 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1554
1555 status = ocfs2_find_new_last_ext_blk(osb,
1556 inode,
1557 fe,
1558 le32_to_cpu(fe->i_clusters) -
1559 clusters_to_del,
1560 old_last_eb_bh,
1561 &last_eb_bh);
1562 if (status < 0) {
1563 mlog_errno(status);
1564 goto bail;
1565 }
1566 if (last_eb_bh)
1567 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1568
1569 status = ocfs2_journal_access(handle, inode, fe_bh,
1570 OCFS2_JOURNAL_ACCESS_WRITE);
1571 if (status < 0) {
1572 mlog_errno(status);
1573 goto bail;
1574 }
1575 el = &(fe->id2.i_list);
1576
1577 spin_lock(&OCFS2_I(inode)->ip_lock);
1578 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1579 clusters_to_del;
1580 spin_unlock(&OCFS2_I(inode)->ip_lock);
1581 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1582 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1583 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1584
1585 i = le16_to_cpu(el->l_next_free_rec) - 1;
1586
1587 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1588 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1589 /* tree depth zero, we can just delete the clusters, otherwise
1590 * we need to record the offset of the next level extent block
1591 * as we may overwrite it. */
1592 if (!el->l_tree_depth)
1593 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1594 + ocfs2_clusters_to_blocks(osb->sb,
1595 le32_to_cpu(el->l_recs[i].e_clusters));
1596 else
1597 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1598
1599 if (!el->l_recs[i].e_clusters) {
1600 /* if we deleted the whole extent record, then clear
1601 * out the other fields and update the extent
1602 * list. For depth > 0 trees, we've already recorded
1603 * the extent block in 'next_eb' */
1604 el->l_recs[i].e_cpos = 0;
1605 el->l_recs[i].e_blkno = 0;
1606 BUG_ON(!el->l_next_free_rec);
1607 le16_add_cpu(&el->l_next_free_rec, -1);
1608 }
1609
1610 depth = le16_to_cpu(el->l_tree_depth);
1611 if (!fe->i_clusters) {
1612 /* trunc to zero is a special case. */
1613 el->l_tree_depth = 0;
1614 fe->i_last_eb_blk = 0;
1615 } else if (last_eb)
1616 fe->i_last_eb_blk = last_eb->h_blkno;
1617
1618 status = ocfs2_journal_dirty(handle, fe_bh);
1619 if (status < 0) {
1620 mlog_errno(status);
1621 goto bail;
1622 }
1623
1624 if (last_eb) {
1625 /* If there will be a new last extent block, then by
1626 * definition, there cannot be any leaves to the right of
1627 * him. */
1628 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1629 OCFS2_JOURNAL_ACCESS_WRITE);
1630 if (status < 0) {
1631 mlog_errno(status);
1632 goto bail;
1633 }
1634 last_eb->h_next_leaf_blk = 0;
1635 status = ocfs2_journal_dirty(handle, last_eb_bh);
1636 if (status < 0) {
1637 mlog_errno(status);
1638 goto bail;
1639 }
1640 }
1641
1642 /* if our tree depth > 0, update all the tree blocks below us. */
1643 while (depth) {
Mark Fashehb06970532006-03-03 10:24:33 -08001644 mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
1645 depth, (unsigned long long)next_eb);
Mark Fashehccd979b2005-12-15 14:31:24 -08001646 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1647 OCFS2_BH_CACHED, inode);
1648 if (status < 0) {
1649 mlog_errno(status);
1650 goto bail;
1651 }
1652 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1653 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1654 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1655 status = -EIO;
1656 goto bail;
1657 }
1658 el = &(eb->h_list);
1659
1660 status = ocfs2_journal_access(handle, inode, eb_bh,
1661 OCFS2_JOURNAL_ACCESS_WRITE);
1662 if (status < 0) {
1663 mlog_errno(status);
1664 goto bail;
1665 }
1666
1667 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1668 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
1669
1670 i = le16_to_cpu(el->l_next_free_rec) - 1;
1671
Mark Fashehb06970532006-03-03 10:24:33 -08001672 mlog(0, "extent block %llu, before: record %d: "
1673 "(%u, %u, %llu), next = %u\n",
1674 (unsigned long long)le64_to_cpu(eb->h_blkno), i,
Mark Fashehccd979b2005-12-15 14:31:24 -08001675 le32_to_cpu(el->l_recs[i].e_cpos),
1676 le32_to_cpu(el->l_recs[i].e_clusters),
Mark Fashehb06970532006-03-03 10:24:33 -08001677 (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
Mark Fashehccd979b2005-12-15 14:31:24 -08001678 le16_to_cpu(el->l_next_free_rec));
1679
1680 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1681 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1682
1683 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1684 /* bottom-most block requires us to delete data.*/
1685 if (!el->l_tree_depth)
1686 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1687 + ocfs2_clusters_to_blocks(osb->sb,
1688 le32_to_cpu(el->l_recs[i].e_clusters));
1689 if (!el->l_recs[i].e_clusters) {
1690 el->l_recs[i].e_cpos = 0;
1691 el->l_recs[i].e_blkno = 0;
1692 BUG_ON(!el->l_next_free_rec);
1693 le16_add_cpu(&el->l_next_free_rec, -1);
1694 }
Mark Fashehb06970532006-03-03 10:24:33 -08001695 mlog(0, "extent block %llu, after: record %d: "
1696 "(%u, %u, %llu), next = %u\n",
1697 (unsigned long long)le64_to_cpu(eb->h_blkno), i,
Mark Fashehccd979b2005-12-15 14:31:24 -08001698 le32_to_cpu(el->l_recs[i].e_cpos),
1699 le32_to_cpu(el->l_recs[i].e_clusters),
Mark Fashehb06970532006-03-03 10:24:33 -08001700 (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
Mark Fashehccd979b2005-12-15 14:31:24 -08001701 le16_to_cpu(el->l_next_free_rec));
1702
1703 status = ocfs2_journal_dirty(handle, eb_bh);
1704 if (status < 0) {
1705 mlog_errno(status);
1706 goto bail;
1707 }
1708
1709 if (!el->l_next_free_rec) {
1710 mlog(0, "deleting this extent block.\n");
1711
1712 ocfs2_remove_from_cache(inode, eb_bh);
1713
Mark Fashehccd979b2005-12-15 14:31:24 -08001714 BUG_ON(el->l_recs[0].e_clusters);
1715 BUG_ON(el->l_recs[0].e_cpos);
1716 BUG_ON(el->l_recs[0].e_blkno);
Mark Fasheheb357462006-08-09 13:23:08 -07001717 if (eb->h_suballoc_slot == 0) {
1718 /*
1719 * This code only understands how to
1720 * lock the suballocator in slot 0,
1721 * which is fine because allocation is
1722 * only ever done out of that
1723 * suballocator too. A future version
1724 * might change that however, so avoid
1725 * a free if we don't know how to
1726 * handle it. This way an fs incompat
1727 * bit will not be necessary.
1728 */
1729 status = ocfs2_free_extent_block(handle,
1730 tc->tc_ext_alloc_inode,
1731 tc->tc_ext_alloc_bh,
1732 eb);
1733 if (status < 0) {
1734 mlog_errno(status);
1735 goto bail;
1736 }
Mark Fashehccd979b2005-12-15 14:31:24 -08001737 }
1738 }
1739 brelse(eb_bh);
1740 eb_bh = NULL;
1741 depth--;
1742 }
1743
1744 BUG_ON(!delete_blk);
1745 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1746 clusters_to_del);
1747 if (status < 0) {
1748 mlog_errno(status);
1749 goto bail;
1750 }
1751 status = 0;
1752bail:
1753 if (!status)
1754 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1755 else
1756 ocfs2_extent_map_drop(inode, 0);
1757 mlog_exit(status);
1758 return status;
1759}
1760
1761/*
1762 * It is expected, that by the time you call this function,
1763 * inode->i_size and fe->i_size have been adjusted.
1764 *
1765 * WARNING: This will kfree the truncate context
1766 */
1767int ocfs2_commit_truncate(struct ocfs2_super *osb,
1768 struct inode *inode,
1769 struct buffer_head *fe_bh,
1770 struct ocfs2_truncate_context *tc)
1771{
1772 int status, i, credits, tl_sem = 0;
1773 u32 clusters_to_del, target_i_clusters;
1774 u64 last_eb = 0;
1775 struct ocfs2_dinode *fe;
1776 struct ocfs2_extent_block *eb;
1777 struct ocfs2_extent_list *el;
1778 struct buffer_head *last_eb_bh;
Mark Fasheh1fabe142006-10-09 18:11:45 -07001779 handle_t *handle = NULL;
Mark Fashehccd979b2005-12-15 14:31:24 -08001780 struct inode *tl_inode = osb->osb_tl_inode;
1781
1782 mlog_entry_void();
1783
1784 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1785
1786 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1787 i_size_read(inode));
1788
1789 last_eb_bh = tc->tc_last_eb_bh;
1790 tc->tc_last_eb_bh = NULL;
1791
1792 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1793
1794 if (fe->id2.i_list.l_tree_depth) {
1795 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1796 el = &eb->h_list;
1797 } else
1798 el = &fe->id2.i_list;
1799 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1800start:
1801 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
Mark Fashehb06970532006-03-03 10:24:33 -08001802 "last_eb = %llu, fe->i_last_eb_blk = %llu, "
Mark Fashehccd979b2005-12-15 14:31:24 -08001803 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
Mark Fashehb06970532006-03-03 10:24:33 -08001804 le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
1805 (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
Mark Fashehccd979b2005-12-15 14:31:24 -08001806 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
1807
1808 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1809 mlog(0, "last_eb changed!\n");
1810 BUG_ON(!fe->id2.i_list.l_tree_depth);
1811 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1812 /* i_last_eb_blk may have changed, read it if
1813 * necessary. We don't have to worry about the
1814 * truncate to zero case here (where there becomes no
1815 * last_eb) because we never loop back after our work
1816 * is done. */
1817 if (last_eb_bh) {
1818 brelse(last_eb_bh);
1819 last_eb_bh = NULL;
1820 }
1821
1822 status = ocfs2_read_block(osb, last_eb,
1823 &last_eb_bh, OCFS2_BH_CACHED,
1824 inode);
1825 if (status < 0) {
1826 mlog_errno(status);
1827 goto bail;
1828 }
1829 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1830 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1831 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1832 status = -EIO;
1833 goto bail;
1834 }
1835 el = &(eb->h_list);
1836 }
1837
1838 /* by now, el will point to the extent list on the bottom most
1839 * portion of this tree. */
1840 i = le16_to_cpu(el->l_next_free_rec) - 1;
1841 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
1842 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1843 else
1844 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1845 le32_to_cpu(el->l_recs[i].e_cpos)) -
1846 target_i_clusters;
1847
1848 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
1849
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001850 mutex_lock(&tl_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08001851 tl_sem = 1;
1852 /* ocfs2_truncate_log_needs_flush guarantees us at least one
1853 * record is free for use. If there isn't any, we flush to get
1854 * an empty truncate log. */
1855 if (ocfs2_truncate_log_needs_flush(osb)) {
1856 status = __ocfs2_flush_truncate_log(osb);
1857 if (status < 0) {
1858 mlog_errno(status);
1859 goto bail;
1860 }
1861 }
1862
1863 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1864 fe, el);
Mark Fasheh65eff9c2006-10-09 17:26:22 -07001865 handle = ocfs2_start_trans(osb, credits);
Mark Fashehccd979b2005-12-15 14:31:24 -08001866 if (IS_ERR(handle)) {
1867 status = PTR_ERR(handle);
1868 handle = NULL;
1869 mlog_errno(status);
1870 goto bail;
1871 }
1872
1873 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1874 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1875 if (status < 0)
1876 mlog_errno(status);
1877
1878 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1879 last_eb_bh, handle, tc);
1880 if (status < 0) {
1881 mlog_errno(status);
1882 goto bail;
1883 }
1884
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001885 mutex_unlock(&tl_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08001886 tl_sem = 0;
1887
Mark Fasheh02dc1af2006-10-09 16:48:10 -07001888 ocfs2_commit_trans(osb, handle);
Mark Fashehccd979b2005-12-15 14:31:24 -08001889 handle = NULL;
1890
1891 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
1892 if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
1893 goto start;
1894bail:
1895 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1896
1897 ocfs2_schedule_truncate_log_flush(osb, 1);
1898
1899 if (tl_sem)
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08001900 mutex_unlock(&tl_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08001901
1902 if (handle)
Mark Fasheh02dc1af2006-10-09 16:48:10 -07001903 ocfs2_commit_trans(osb, handle);
Mark Fashehccd979b2005-12-15 14:31:24 -08001904
1905 if (last_eb_bh)
1906 brelse(last_eb_bh);
1907
1908 /* This will drop the ext_alloc cluster lock for us */
1909 ocfs2_free_truncate_context(tc);
1910
1911 mlog_exit(status);
1912 return status;
1913}
1914
1915
1916/*
1917 * Expects the inode to already be locked. This will figure out which
1918 * inodes need to be locked and will put them on the returned truncate
1919 * context.
1920 */
1921int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1922 struct inode *inode,
1923 struct buffer_head *fe_bh,
1924 struct ocfs2_truncate_context **tc)
1925{
1926 int status, metadata_delete;
1927 unsigned int new_i_clusters;
1928 struct ocfs2_dinode *fe;
1929 struct ocfs2_extent_block *eb;
1930 struct ocfs2_extent_list *el;
1931 struct buffer_head *last_eb_bh = NULL;
1932 struct inode *ext_alloc_inode = NULL;
1933 struct buffer_head *ext_alloc_bh = NULL;
1934
1935 mlog_entry_void();
1936
1937 *tc = NULL;
1938
1939 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1940 i_size_read(inode));
1941 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1942
1943 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
Mark Fashehb06970532006-03-03 10:24:33 -08001944 "%llu\n", fe->i_clusters, new_i_clusters,
1945 (unsigned long long)fe->i_size);
Mark Fashehccd979b2005-12-15 14:31:24 -08001946
1947 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
Mark Fashehb06970532006-03-03 10:24:33 -08001948 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1949 "%u and size %llu whereas struct inode has "
Mark Fashehccd979b2005-12-15 14:31:24 -08001950 "cluster count %u and size %llu which caused an "
1951 "invalid truncate to %u clusters.",
Mark Fashehb06970532006-03-03 10:24:33 -08001952 (unsigned long long)le64_to_cpu(fe->i_blkno),
Mark Fashehccd979b2005-12-15 14:31:24 -08001953 le32_to_cpu(fe->i_clusters),
Mark Fashehb06970532006-03-03 10:24:33 -08001954 (unsigned long long)le64_to_cpu(fe->i_size),
Mark Fashehccd979b2005-12-15 14:31:24 -08001955 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1956 new_i_clusters);
1957 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1958 status = -EIO;
1959 goto bail;
1960 }
1961
Robert P. J. Daycd861282006-12-13 00:34:52 -08001962 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
Mark Fashehccd979b2005-12-15 14:31:24 -08001963 if (!(*tc)) {
1964 status = -ENOMEM;
1965 mlog_errno(status);
1966 goto bail;
1967 }
1968
1969 metadata_delete = 0;
1970 if (fe->id2.i_list.l_tree_depth) {
1971 /* If we have a tree, then the truncate may result in
1972 * metadata deletes. Figure this out from the
1973 * rightmost leaf block.*/
1974 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
1975 &last_eb_bh, OCFS2_BH_CACHED, inode);
1976 if (status < 0) {
1977 mlog_errno(status);
1978 goto bail;
1979 }
1980 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1981 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1982 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1983
1984 brelse(last_eb_bh);
1985 status = -EIO;
1986 goto bail;
1987 }
1988 el = &(eb->h_list);
1989 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
1990 metadata_delete = 1;
1991 }
1992
1993 (*tc)->tc_last_eb_bh = last_eb_bh;
1994
1995 if (metadata_delete) {
1996 mlog(0, "Will have to delete metadata for this trunc. "
1997 "locking allocator.\n");
1998 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
1999 if (!ext_alloc_inode) {
2000 status = -ENOMEM;
2001 mlog_errno(status);
2002 goto bail;
2003 }
2004
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08002005 mutex_lock(&ext_alloc_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08002006 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
2007
Mark Fasheh4bcec182006-10-09 16:02:40 -07002008 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
Mark Fashehccd979b2005-12-15 14:31:24 -08002009 if (status < 0) {
2010 mlog_errno(status);
2011 goto bail;
2012 }
2013 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
2014 (*tc)->tc_ext_alloc_locked = 1;
2015 }
2016
2017 status = 0;
2018bail:
2019 if (status < 0) {
2020 if (*tc)
2021 ocfs2_free_truncate_context(*tc);
2022 *tc = NULL;
2023 }
2024 mlog_exit_void();
2025 return status;
2026}
2027
2028static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
2029{
2030 if (tc->tc_ext_alloc_inode) {
2031 if (tc->tc_ext_alloc_locked)
2032 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
2033
Jes Sorensen1b1dcc12006-01-09 15:59:24 -08002034 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
Mark Fashehccd979b2005-12-15 14:31:24 -08002035 iput(tc->tc_ext_alloc_inode);
2036 }
2037
2038 if (tc->tc_ext_alloc_bh)
2039 brelse(tc->tc_ext_alloc_bh);
2040
2041 if (tc->tc_last_eb_bh)
2042 brelse(tc->tc_last_eb_bh);
2043
2044 kfree(tc);
2045}