blob: 7ac68cac041d9f7c0765a819cbec49c8b57a3b12 [file] [log] [blame]
Mark Fashehccd979b2005-12-15 14:31:24 -08001/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.c
5 *
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45#include "uptodate.h"
46
47#include "buffer_head_io.h"
48
49static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53 struct inode *alloc_inode,
54 struct buffer_head *bg_bh,
55 u64 group_blkno,
56 u16 my_chain,
57 struct ocfs2_chain_list *cl);
58static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 struct inode *alloc_inode,
60 struct buffer_head *bh);
61
62static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63 struct ocfs2_alloc_context *ac);
64
65static int ocfs2_cluster_group_search(struct inode *inode,
66 struct buffer_head *group_bh,
67 u32 bits_wanted, u32 min_bits,
68 u16 *bit_off, u16 *bits_found);
69static int ocfs2_block_group_search(struct inode *inode,
70 struct buffer_head *group_bh,
71 u32 bits_wanted, u32 min_bits,
72 u16 *bit_off, u16 *bits_found);
73static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
74 u32 bits_wanted,
75 u32 min_bits,
76 u16 *bit_off,
77 unsigned int *num_bits,
78 u64 *bg_blkno);
79static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80 struct ocfs2_alloc_context *ac,
81 u32 bits_wanted,
82 u32 min_bits,
83 u16 *bit_off,
84 unsigned int *num_bits,
85 u64 *bg_blkno);
86static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87 int nr);
Mark Fashehccd979b2005-12-15 14:31:24 -080088static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
89 struct inode *alloc_inode,
90 struct ocfs2_group_desc *bg,
91 struct buffer_head *group_bh,
92 unsigned int bit_off,
93 unsigned int num_bits);
94static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
95 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg,
97 struct buffer_head *group_bh,
98 unsigned int bit_off,
99 unsigned int num_bits);
100
101static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
102 struct inode *alloc_inode,
103 struct buffer_head *fe_bh,
104 struct buffer_head *bg_bh,
105 struct buffer_head *prev_bg_bh,
106 u16 chain);
107static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
108 u32 wanted);
109static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
110 struct inode *alloc_inode,
111 struct buffer_head *alloc_bh,
112 unsigned int start_bit,
113 u64 bg_blkno,
114 unsigned int count);
115static inline u64 ocfs2_which_suballoc_group(u64 block,
116 unsigned int bit);
117static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
118 u64 bg_blkno,
119 u16 bg_bit_off);
120static inline u64 ocfs2_which_cluster_group(struct inode *inode,
121 u32 cluster);
122static inline void ocfs2_block_to_cluster_group(struct inode *inode,
123 u64 data_blkno,
124 u64 *bg_blkno,
125 u16 *bg_bit_off);
126
127void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
128{
129 if (ac->ac_inode)
130 iput(ac->ac_inode);
131 if (ac->ac_bh)
132 brelse(ac->ac_bh);
133 kfree(ac);
134}
135
136static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
137{
138 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
139}
140
Mark Fasheh7bf72ed2006-05-03 17:46:50 -0700141/* somewhat more expensive than our other checks, so use sparingly. */
142static int ocfs2_check_group_descriptor(struct super_block *sb,
143 struct ocfs2_dinode *di,
144 struct ocfs2_group_desc *gd)
145{
146 unsigned int max_bits;
147
148 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
149 OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
150 return -EIO;
151 }
152
153 if (di->i_blkno != gd->bg_parent_dinode) {
154 ocfs2_error(sb, "Group descriptor # %llu has bad parent "
155 "pointer (%llu, expected %llu)",
156 (unsigned long long)le64_to_cpu(gd->bg_blkno),
157 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
158 (unsigned long long)le64_to_cpu(di->i_blkno));
159 return -EIO;
160 }
161
162 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
163 if (le16_to_cpu(gd->bg_bits) > max_bits) {
164 ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
165 (unsigned long long)le64_to_cpu(gd->bg_blkno),
166 le16_to_cpu(gd->bg_bits));
167 return -EIO;
168 }
169
170 if (le16_to_cpu(gd->bg_chain) >=
171 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
172 ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
173 (unsigned long long)le64_to_cpu(gd->bg_blkno),
174 le16_to_cpu(gd->bg_chain));
175 return -EIO;
176 }
177
178 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
179 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
180 "claims that %u are free",
181 (unsigned long long)le64_to_cpu(gd->bg_blkno),
182 le16_to_cpu(gd->bg_bits),
183 le16_to_cpu(gd->bg_free_bits_count));
184 return -EIO;
185 }
186
187 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
188 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
189 "max bitmap bits of %u",
190 (unsigned long long)le64_to_cpu(gd->bg_blkno),
191 le16_to_cpu(gd->bg_bits),
192 8 * le16_to_cpu(gd->bg_size));
193 return -EIO;
194 }
195
196 return 0;
197}
198
Mark Fashehccd979b2005-12-15 14:31:24 -0800199static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
200 struct inode *alloc_inode,
201 struct buffer_head *bg_bh,
202 u64 group_blkno,
203 u16 my_chain,
204 struct ocfs2_chain_list *cl)
205{
206 int status = 0;
207 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
208 struct super_block * sb = alloc_inode->i_sb;
209
210 mlog_entry_void();
211
212 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
Mark Fashehb06970532006-03-03 10:24:33 -0800213 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
214 "b_blocknr (%llu)",
215 (unsigned long long)group_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -0800216 (unsigned long long) bg_bh->b_blocknr);
217 status = -EIO;
218 goto bail;
219 }
220
221 status = ocfs2_journal_access(handle,
222 alloc_inode,
223 bg_bh,
224 OCFS2_JOURNAL_ACCESS_CREATE);
225 if (status < 0) {
226 mlog_errno(status);
227 goto bail;
228 }
229
230 memset(bg, 0, sb->s_blocksize);
231 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
232 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
233 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
234 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
235 bg->bg_chain = cpu_to_le16(my_chain);
236 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
237 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
238 bg->bg_blkno = cpu_to_le64(group_blkno);
239 /* set the 1st bit in the bitmap to account for the descriptor block */
240 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
241 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
242
243 status = ocfs2_journal_dirty(handle, bg_bh);
244 if (status < 0)
245 mlog_errno(status);
246
247 /* There is no need to zero out or otherwise initialize the
248 * other blocks in a group - All valid FS metadata in a block
249 * group stores the superblock fs_generation value at
250 * allocation time. */
251
252bail:
253 mlog_exit(status);
254 return status;
255}
256
257static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
258{
259 u16 curr, best;
260
261 best = curr = 0;
262 while (curr < le16_to_cpu(cl->cl_count)) {
263 if (le32_to_cpu(cl->cl_recs[best].c_total) >
264 le32_to_cpu(cl->cl_recs[curr].c_total))
265 best = curr;
266 curr++;
267 }
268 return best;
269}
270
271/*
272 * We expect the block group allocator to already be locked.
273 */
274static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
275 struct inode *alloc_inode,
276 struct buffer_head *bh)
277{
278 int status, credits;
279 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
280 struct ocfs2_chain_list *cl;
281 struct ocfs2_alloc_context *ac = NULL;
282 struct ocfs2_journal_handle *handle = NULL;
283 u32 bit_off, num_bits;
284 u16 alloc_rec;
285 u64 bg_blkno;
286 struct buffer_head *bg_bh = NULL;
287 struct ocfs2_group_desc *bg;
288
289 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
290
291 mlog_entry_void();
292
293 handle = ocfs2_alloc_handle(osb);
294 if (!handle) {
295 status = -ENOMEM;
296 mlog_errno(status);
297 goto bail;
298 }
299
300 cl = &fe->id2.i_chain;
301 status = ocfs2_reserve_clusters(osb,
302 handle,
303 le16_to_cpu(cl->cl_cpg),
304 &ac);
305 if (status < 0) {
306 if (status != -ENOSPC)
307 mlog_errno(status);
308 goto bail;
309 }
310
311 credits = ocfs2_calc_group_alloc_credits(osb->sb,
312 le16_to_cpu(cl->cl_cpg));
313 handle = ocfs2_start_trans(osb, handle, credits);
314 if (IS_ERR(handle)) {
315 status = PTR_ERR(handle);
316 handle = NULL;
317 mlog_errno(status);
318 goto bail;
319 }
320
321 status = ocfs2_claim_clusters(osb,
322 handle,
323 ac,
324 le16_to_cpu(cl->cl_cpg),
325 &bit_off,
326 &num_bits);
327 if (status < 0) {
328 if (status != -ENOSPC)
329 mlog_errno(status);
330 goto bail;
331 }
332
333 alloc_rec = ocfs2_find_smallest_chain(cl);
334
335 /* setup the group */
336 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
Mark Fashehb06970532006-03-03 10:24:33 -0800337 mlog(0, "new descriptor, record %u, at block %llu\n",
338 alloc_rec, (unsigned long long)bg_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -0800339
340 bg_bh = sb_getblk(osb->sb, bg_blkno);
341 if (!bg_bh) {
342 status = -EIO;
343 mlog_errno(status);
344 goto bail;
345 }
346 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
347
348 status = ocfs2_block_group_fill(handle,
349 alloc_inode,
350 bg_bh,
351 bg_blkno,
352 alloc_rec,
353 cl);
354 if (status < 0) {
355 mlog_errno(status);
356 goto bail;
357 }
358
359 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
360
361 status = ocfs2_journal_access(handle, alloc_inode,
362 bh, OCFS2_JOURNAL_ACCESS_WRITE);
363 if (status < 0) {
364 mlog_errno(status);
365 goto bail;
366 }
367
368 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
369 le16_to_cpu(bg->bg_free_bits_count));
370 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
371 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
372 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
373 le16_add_cpu(&cl->cl_next_free_rec, 1);
374
375 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
376 le16_to_cpu(bg->bg_free_bits_count));
377 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
378 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
379
380 status = ocfs2_journal_dirty(handle, bh);
381 if (status < 0) {
382 mlog_errno(status);
383 goto bail;
384 }
385
386 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
387 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
388 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
389 le32_to_cpu(fe->i_clusters)));
390 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
391 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
392 alloc_inode->i_blocks =
393 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
394
395 status = 0;
396bail:
397 if (handle)
398 ocfs2_commit_trans(handle);
399
400 if (ac)
401 ocfs2_free_alloc_context(ac);
402
403 if (bg_bh)
404 brelse(bg_bh);
405
406 mlog_exit(status);
407 return status;
408}
409
410static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
411 struct ocfs2_alloc_context *ac)
412{
413 int status;
414 u32 bits_wanted = ac->ac_bits_wanted;
415 struct inode *alloc_inode = ac->ac_inode;
416 struct buffer_head *bh = NULL;
417 struct ocfs2_journal_handle *handle = ac->ac_handle;
418 struct ocfs2_dinode *fe;
419 u32 free_bits;
420
421 mlog_entry_void();
422
423 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
424
425 ocfs2_handle_add_inode(handle, alloc_inode);
426 status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
427 if (status < 0) {
428 mlog_errno(status);
429 goto bail;
430 }
431
432 fe = (struct ocfs2_dinode *) bh->b_data;
433 if (!OCFS2_IS_VALID_DINODE(fe)) {
434 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
435 status = -EIO;
436 goto bail;
437 }
438 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
Mark Fashehb06970532006-03-03 10:24:33 -0800439 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
440 (unsigned long long)le64_to_cpu(fe->i_blkno));
Mark Fashehccd979b2005-12-15 14:31:24 -0800441 status = -EIO;
442 goto bail;
443 }
444
445 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
446 le32_to_cpu(fe->id1.bitmap1.i_used);
447
448 if (bits_wanted > free_bits) {
449 /* cluster bitmap never grows */
450 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
451 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
452 bits_wanted, free_bits);
453 status = -ENOSPC;
454 goto bail;
455 }
456
457 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
458 if (status < 0) {
459 if (status != -ENOSPC)
460 mlog_errno(status);
461 goto bail;
462 }
463 atomic_inc(&osb->alloc_stats.bg_extends);
464
465 /* You should never ask for this much metadata */
466 BUG_ON(bits_wanted >
467 (le32_to_cpu(fe->id1.bitmap1.i_total)
468 - le32_to_cpu(fe->id1.bitmap1.i_used)));
469 }
470
471 get_bh(bh);
472 ac->ac_bh = bh;
473bail:
474 if (bh)
475 brelse(bh);
476
477 mlog_exit(status);
478 return status;
479}
480
481int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
482 struct ocfs2_journal_handle *handle,
483 struct ocfs2_dinode *fe,
484 struct ocfs2_alloc_context **ac)
485{
486 int status;
487 struct inode *alloc_inode = NULL;
488
489 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
490 if (!(*ac)) {
491 status = -ENOMEM;
492 mlog_errno(status);
493 goto bail;
494 }
495
496 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
497 (*ac)->ac_handle = handle;
498 (*ac)->ac_which = OCFS2_AC_USE_META;
499
500#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
501 alloc_inode = ocfs2_get_system_file_inode(osb,
502 EXTENT_ALLOC_SYSTEM_INODE,
503 0);
504#else
505 alloc_inode = ocfs2_get_system_file_inode(osb,
506 EXTENT_ALLOC_SYSTEM_INODE,
507 osb->slot_num);
508#endif
509 if (!alloc_inode) {
510 status = -ENOMEM;
511 mlog_errno(status);
512 goto bail;
513 }
514
515 (*ac)->ac_inode = igrab(alloc_inode);
516 (*ac)->ac_group_search = ocfs2_block_group_search;
517
518 status = ocfs2_reserve_suballoc_bits(osb, (*ac));
519 if (status < 0) {
520 if (status != -ENOSPC)
521 mlog_errno(status);
522 goto bail;
523 }
524
525 status = 0;
526bail:
527 if ((status < 0) && *ac) {
528 ocfs2_free_alloc_context(*ac);
529 *ac = NULL;
530 }
531
532 if (alloc_inode)
533 iput(alloc_inode);
534
535 mlog_exit(status);
536 return status;
537}
538
539int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
540 struct ocfs2_journal_handle *handle,
541 struct ocfs2_alloc_context **ac)
542{
543 int status;
544 struct inode *alloc_inode = NULL;
545
546 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
547 if (!(*ac)) {
548 status = -ENOMEM;
549 mlog_errno(status);
550 goto bail;
551 }
552
553 (*ac)->ac_bits_wanted = 1;
554 (*ac)->ac_handle = handle;
555 (*ac)->ac_which = OCFS2_AC_USE_INODE;
556
557 alloc_inode = ocfs2_get_system_file_inode(osb,
558 INODE_ALLOC_SYSTEM_INODE,
559 osb->slot_num);
560 if (!alloc_inode) {
561 status = -ENOMEM;
562 mlog_errno(status);
563 goto bail;
564 }
565
566 (*ac)->ac_inode = igrab(alloc_inode);
567 (*ac)->ac_group_search = ocfs2_block_group_search;
568
569 status = ocfs2_reserve_suballoc_bits(osb, *ac);
570 if (status < 0) {
571 if (status != -ENOSPC)
572 mlog_errno(status);
573 goto bail;
574 }
575
576 status = 0;
577bail:
578 if ((status < 0) && *ac) {
579 ocfs2_free_alloc_context(*ac);
580 *ac = NULL;
581 }
582
583 if (alloc_inode)
584 iput(alloc_inode);
585
586 mlog_exit(status);
587 return status;
588}
589
590/* local alloc code has to do the same thing, so rather than do this
591 * twice.. */
592int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
593 struct ocfs2_alloc_context *ac)
594{
595 int status;
596
597 ac->ac_inode = ocfs2_get_system_file_inode(osb,
598 GLOBAL_BITMAP_SYSTEM_INODE,
599 OCFS2_INVALID_SLOT);
600 if (!ac->ac_inode) {
601 status = -EINVAL;
602 mlog(ML_ERROR, "Could not get bitmap inode!\n");
603 goto bail;
604 }
605 ac->ac_which = OCFS2_AC_USE_MAIN;
606 ac->ac_group_search = ocfs2_cluster_group_search;
607
608 status = ocfs2_reserve_suballoc_bits(osb, ac);
609 if (status < 0 && status != -ENOSPC)
610 mlog_errno(status);
611bail:
612 return status;
613}
614
615/* Callers don't need to care which bitmap (local alloc or main) to
616 * use so we figure it out for them, but unfortunately this clutters
617 * things a bit. */
618int ocfs2_reserve_clusters(struct ocfs2_super *osb,
619 struct ocfs2_journal_handle *handle,
620 u32 bits_wanted,
621 struct ocfs2_alloc_context **ac)
622{
623 int status;
624
625 mlog_entry_void();
626
627 BUG_ON(!handle);
628
629 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
630 if (!(*ac)) {
631 status = -ENOMEM;
632 mlog_errno(status);
633 goto bail;
634 }
635
636 (*ac)->ac_bits_wanted = bits_wanted;
637 (*ac)->ac_handle = handle;
638
639 status = -ENOSPC;
640 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
641 status = ocfs2_reserve_local_alloc_bits(osb,
642 handle,
643 bits_wanted,
644 *ac);
645 if ((status < 0) && (status != -ENOSPC)) {
646 mlog_errno(status);
647 goto bail;
648 } else if (status == -ENOSPC) {
649 /* reserve_local_bits will return enospc with
650 * the local alloc inode still locked, so we
651 * can change this safely here. */
652 mlog(0, "Disabling local alloc\n");
653 /* We set to OCFS2_LA_DISABLED so that umount
654 * can clean up what's left of the local
655 * allocation */
656 osb->local_alloc_state = OCFS2_LA_DISABLED;
657 }
658 }
659
660 if (status == -ENOSPC) {
661 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
662 if (status < 0) {
663 if (status != -ENOSPC)
664 mlog_errno(status);
665 goto bail;
666 }
667 }
668
669 status = 0;
670bail:
671 if ((status < 0) && *ac) {
672 ocfs2_free_alloc_context(*ac);
673 *ac = NULL;
674 }
675
676 mlog_exit(status);
677 return status;
678}
679
680/*
681 * More or less lifted from ext3. I'll leave their description below:
682 *
683 * "For ext3 allocations, we must not reuse any blocks which are
684 * allocated in the bitmap buffer's "last committed data" copy. This
685 * prevents deletes from freeing up the page for reuse until we have
686 * committed the delete transaction.
687 *
688 * If we didn't do this, then deleting something and reallocating it as
689 * data would allow the old block to be overwritten before the
690 * transaction committed (because we force data to disk before commit).
691 * This would lead to corruption if we crashed between overwriting the
692 * data and committing the delete.
693 *
694 * @@@ We may want to make this allocation behaviour conditional on
695 * data-writes at some point, and disable it for metadata allocations or
696 * sync-data inodes."
697 *
698 * Note: OCFS2 already does this differently for metadata vs data
699 * allocations, as those bitmaps are seperate and undo access is never
700 * called on a metadata group descriptor.
701 */
702static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
703 int nr)
704{
705 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
706
707 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
708 return 0;
709 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
710 return 1;
711
712 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
713 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
714}
715
716static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
717 struct buffer_head *bg_bh,
718 unsigned int bits_wanted,
Mark Fasheh7bf72ed2006-05-03 17:46:50 -0700719 unsigned int total_bits,
Mark Fashehccd979b2005-12-15 14:31:24 -0800720 u16 *bit_off,
721 u16 *bits_found)
722{
723 void *bitmap;
724 u16 best_offset, best_size;
725 int offset, start, found, status = 0;
726 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
727
728 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
729 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
730 return -EIO;
731 }
732
733 found = start = best_offset = best_size = 0;
734 bitmap = bg->bg_bitmap;
735
Mark Fasheh7bf72ed2006-05-03 17:46:50 -0700736 while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
737 if (offset == total_bits)
Mark Fashehccd979b2005-12-15 14:31:24 -0800738 break;
739
740 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
741 /* We found a zero, but we can't use it as it
742 * hasn't been put to disk yet! */
743 found = 0;
744 start = offset + 1;
745 } else if (offset == start) {
746 /* we found a zero */
747 found++;
748 /* move start to the next bit to test */
749 start++;
750 } else {
751 /* got a zero after some ones */
752 found = 1;
753 start = offset + 1;
754 }
755 if (found > best_size) {
756 best_size = found;
757 best_offset = start - found;
758 }
759 /* we got everything we needed */
760 if (found == bits_wanted) {
761 /* mlog(0, "Found it all!\n"); */
762 break;
763 }
764 }
765
766 /* XXX: I think the first clause is equivalent to the second
767 * - jlbec */
768 if (found == bits_wanted) {
769 *bit_off = start - found;
770 *bits_found = found;
771 } else if (best_size) {
772 *bit_off = best_offset;
773 *bits_found = best_size;
774 } else {
775 status = -ENOSPC;
776 /* No error log here -- see the comment above
777 * ocfs2_test_bg_bit_allocatable */
778 }
779
780 return status;
781}
782
783static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
784 struct inode *alloc_inode,
785 struct ocfs2_group_desc *bg,
786 struct buffer_head *group_bh,
787 unsigned int bit_off,
788 unsigned int num_bits)
789{
790 int status;
791 void *bitmap = bg->bg_bitmap;
792 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
793
794 mlog_entry_void();
795
796 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
797 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
798 status = -EIO;
799 goto bail;
800 }
801 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
802
803 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
804 num_bits);
805
806 if (ocfs2_is_cluster_bitmap(alloc_inode))
807 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
808
809 status = ocfs2_journal_access(handle,
810 alloc_inode,
811 group_bh,
812 journal_type);
813 if (status < 0) {
814 mlog_errno(status);
815 goto bail;
816 }
817
818 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
819
820 while(num_bits--)
821 ocfs2_set_bit(bit_off++, bitmap);
822
823 status = ocfs2_journal_dirty(handle,
824 group_bh);
825 if (status < 0) {
826 mlog_errno(status);
827 goto bail;
828 }
829
830bail:
831 mlog_exit(status);
832 return status;
833}
834
835/* find the one with the most empty bits */
836static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
837{
838 u16 curr, best;
839
840 BUG_ON(!cl->cl_next_free_rec);
841
842 best = curr = 0;
843 while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
844 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
845 le32_to_cpu(cl->cl_recs[best].c_free))
846 best = curr;
847 curr++;
848 }
849
850 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
851 return best;
852}
853
854static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
855 struct inode *alloc_inode,
856 struct buffer_head *fe_bh,
857 struct buffer_head *bg_bh,
858 struct buffer_head *prev_bg_bh,
859 u16 chain)
860{
861 int status;
862 /* there is a really tiny chance the journal calls could fail,
863 * but we wouldn't want inconsistent blocks in *any* case. */
864 u64 fe_ptr, bg_ptr, prev_bg_ptr;
865 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
866 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
867 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
868
869 if (!OCFS2_IS_VALID_DINODE(fe)) {
870 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
871 status = -EIO;
872 goto out;
873 }
874 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
875 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
876 status = -EIO;
877 goto out;
878 }
879 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
880 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
881 status = -EIO;
882 goto out;
883 }
884
Mark Fashehb06970532006-03-03 10:24:33 -0800885 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
886 (unsigned long long)fe->i_blkno, chain,
887 (unsigned long long)bg->bg_blkno,
888 (unsigned long long)prev_bg->bg_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -0800889
890 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
891 bg_ptr = le64_to_cpu(bg->bg_next_group);
892 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
893
894 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
895 OCFS2_JOURNAL_ACCESS_WRITE);
896 if (status < 0) {
897 mlog_errno(status);
898 goto out_rollback;
899 }
900
901 prev_bg->bg_next_group = bg->bg_next_group;
902
903 status = ocfs2_journal_dirty(handle, prev_bg_bh);
904 if (status < 0) {
905 mlog_errno(status);
906 goto out_rollback;
907 }
908
909 status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
910 OCFS2_JOURNAL_ACCESS_WRITE);
911 if (status < 0) {
912 mlog_errno(status);
913 goto out_rollback;
914 }
915
916 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
917
918 status = ocfs2_journal_dirty(handle, bg_bh);
919 if (status < 0) {
920 mlog_errno(status);
921 goto out_rollback;
922 }
923
924 status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
925 OCFS2_JOURNAL_ACCESS_WRITE);
926 if (status < 0) {
927 mlog_errno(status);
928 goto out_rollback;
929 }
930
931 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
932
933 status = ocfs2_journal_dirty(handle, fe_bh);
934 if (status < 0) {
935 mlog_errno(status);
936 goto out_rollback;
937 }
938
939 status = 0;
940out_rollback:
941 if (status < 0) {
942 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
943 bg->bg_next_group = cpu_to_le64(bg_ptr);
944 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
945 }
946out:
947 mlog_exit(status);
948 return status;
949}
950
951static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
952 u32 wanted)
953{
954 return le16_to_cpu(bg->bg_free_bits_count) > wanted;
955}
956
957/* return 0 on success, -ENOSPC to keep searching and any other < 0
958 * value on error. */
959static int ocfs2_cluster_group_search(struct inode *inode,
960 struct buffer_head *group_bh,
961 u32 bits_wanted, u32 min_bits,
962 u16 *bit_off, u16 *bits_found)
963{
964 int search = -ENOSPC;
965 int ret;
Mark Fasheh7bf72ed2006-05-03 17:46:50 -0700966 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
Mark Fashehccd979b2005-12-15 14:31:24 -0800967 u16 tmp_off, tmp_found;
Mark Fasheh7bf72ed2006-05-03 17:46:50 -0700968 unsigned int max_bits, gd_cluster_off;
Mark Fashehccd979b2005-12-15 14:31:24 -0800969
970 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
971
Mark Fasheh7bf72ed2006-05-03 17:46:50 -0700972 if (gd->bg_free_bits_count) {
973 max_bits = le16_to_cpu(gd->bg_bits);
974
975 /* Tail groups in cluster bitmaps which aren't cpg
976 * aligned are prone to partial extention by a failed
977 * fs resize. If the file system resize never got to
978 * update the dinode cluster count, then we don't want
979 * to trust any clusters past it, regardless of what
980 * the group descriptor says. */
981 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
982 le64_to_cpu(gd->bg_blkno));
983 if ((gd_cluster_off + max_bits) >
984 OCFS2_I(inode)->ip_clusters) {
985 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
986 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
987 (unsigned long long)le64_to_cpu(gd->bg_blkno),
988 le16_to_cpu(gd->bg_bits),
989 OCFS2_I(inode)->ip_clusters, max_bits);
990 }
991
Mark Fashehccd979b2005-12-15 14:31:24 -0800992 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
993 group_bh, bits_wanted,
Mark Fasheh7bf72ed2006-05-03 17:46:50 -0700994 max_bits,
Mark Fashehccd979b2005-12-15 14:31:24 -0800995 &tmp_off, &tmp_found);
996 if (ret)
997 return ret;
998
999 /* ocfs2_block_group_find_clear_bits() might
1000 * return success, but we still want to return
1001 * -ENOSPC unless it found the minimum number
1002 * of bits. */
1003 if (min_bits <= tmp_found) {
1004 *bit_off = tmp_off;
1005 *bits_found = tmp_found;
1006 search = 0; /* success */
1007 }
1008 }
1009
1010 return search;
1011}
1012
1013static int ocfs2_block_group_search(struct inode *inode,
1014 struct buffer_head *group_bh,
1015 u32 bits_wanted, u32 min_bits,
1016 u16 *bit_off, u16 *bits_found)
1017{
1018 int ret = -ENOSPC;
1019 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1020
1021 BUG_ON(min_bits != 1);
1022 BUG_ON(ocfs2_is_cluster_bitmap(inode));
1023
1024 if (bg->bg_free_bits_count)
1025 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1026 group_bh, bits_wanted,
Mark Fasheh7bf72ed2006-05-03 17:46:50 -07001027 le16_to_cpu(bg->bg_bits),
Mark Fashehccd979b2005-12-15 14:31:24 -08001028 bit_off, bits_found);
1029
1030 return ret;
1031}
1032
1033static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1034 u32 bits_wanted,
1035 u32 min_bits,
1036 u16 *bit_off,
1037 unsigned int *num_bits,
1038 u64 *bg_blkno)
1039{
1040 int status;
1041 u16 chain, tmp_bits;
1042 u32 tmp_used;
1043 u64 next_group;
1044 struct ocfs2_journal_handle *handle = ac->ac_handle;
1045 struct inode *alloc_inode = ac->ac_inode;
1046 struct buffer_head *group_bh = NULL;
1047 struct buffer_head *prev_group_bh = NULL;
1048 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1049 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1050 struct ocfs2_group_desc *bg;
1051
1052 chain = ac->ac_chain;
Mark Fashehb06970532006-03-03 10:24:33 -08001053 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1054 bits_wanted, chain,
1055 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001056
1057 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1058 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1059 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1060 if (status < 0) {
1061 mlog_errno(status);
1062 goto bail;
1063 }
1064 bg = (struct ocfs2_group_desc *) group_bh->b_data;
Mark Fasheh7bf72ed2006-05-03 17:46:50 -07001065 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1066 if (status) {
1067 mlog_errno(status);
Mark Fashehccd979b2005-12-15 14:31:24 -08001068 goto bail;
1069 }
1070
1071 status = -ENOSPC;
1072 /* for now, the chain search is a bit simplistic. We just use
1073 * the 1st group with any empty bits. */
1074 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1075 bits_wanted, min_bits, bit_off,
1076 &tmp_bits)) == -ENOSPC) {
1077 if (!bg->bg_next_group)
1078 break;
1079
1080 if (prev_group_bh) {
1081 brelse(prev_group_bh);
1082 prev_group_bh = NULL;
1083 }
1084 next_group = le64_to_cpu(bg->bg_next_group);
1085 prev_group_bh = group_bh;
1086 group_bh = NULL;
1087 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1088 next_group, &group_bh,
1089 OCFS2_BH_CACHED, alloc_inode);
1090 if (status < 0) {
1091 mlog_errno(status);
1092 goto bail;
1093 }
1094 bg = (struct ocfs2_group_desc *) group_bh->b_data;
Mark Fasheh7bf72ed2006-05-03 17:46:50 -07001095 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1096 if (status) {
1097 mlog_errno(status);
Mark Fashehccd979b2005-12-15 14:31:24 -08001098 goto bail;
1099 }
1100 }
1101 if (status < 0) {
1102 if (status != -ENOSPC)
1103 mlog_errno(status);
1104 goto bail;
1105 }
1106
Mark Fashehb06970532006-03-03 10:24:33 -08001107 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1108 tmp_bits, (unsigned long long)bg->bg_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001109
1110 *num_bits = tmp_bits;
1111
1112 BUG_ON(*num_bits == 0);
1113
1114 /*
1115 * Keep track of previous block descriptor read. When
1116 * we find a target, if we have read more than X
1117 * number of descriptors, and the target is reasonably
1118 * empty, relink him to top of his chain.
1119 *
1120 * We've read 0 extra blocks and only send one more to
1121 * the transaction, yet the next guy to search has a
1122 * much easier time.
1123 *
1124 * Do this *after* figuring out how many bits we're taking out
1125 * of our target group.
1126 */
1127 if (ac->ac_allow_chain_relink &&
1128 (prev_group_bh) &&
1129 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1130 status = ocfs2_relink_block_group(handle, alloc_inode,
1131 ac->ac_bh, group_bh,
1132 prev_group_bh, chain);
1133 if (status < 0) {
1134 mlog_errno(status);
1135 goto bail;
1136 }
1137 }
1138
1139 /* Ok, claim our bits now: set the info on dinode, chainlist
1140 * and then the group */
1141 status = ocfs2_journal_access(handle,
1142 alloc_inode,
1143 ac->ac_bh,
1144 OCFS2_JOURNAL_ACCESS_WRITE);
1145 if (status < 0) {
1146 mlog_errno(status);
1147 goto bail;
1148 }
1149
1150 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1151 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1152 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1153
1154 status = ocfs2_journal_dirty(handle,
1155 ac->ac_bh);
1156 if (status < 0) {
1157 mlog_errno(status);
1158 goto bail;
1159 }
1160
1161 status = ocfs2_block_group_set_bits(handle,
1162 alloc_inode,
1163 bg,
1164 group_bh,
1165 *bit_off,
1166 *num_bits);
1167 if (status < 0) {
1168 mlog_errno(status);
1169 goto bail;
1170 }
1171
Mark Fashehb06970532006-03-03 10:24:33 -08001172 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1173 (unsigned long long)fe->i_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001174
1175 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1176bail:
1177 if (group_bh)
1178 brelse(group_bh);
1179 if (prev_group_bh)
1180 brelse(prev_group_bh);
1181
1182 mlog_exit(status);
1183 return status;
1184}
1185
1186/* will give out up to bits_wanted contiguous bits. */
1187static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1188 struct ocfs2_alloc_context *ac,
1189 u32 bits_wanted,
1190 u32 min_bits,
1191 u16 *bit_off,
1192 unsigned int *num_bits,
1193 u64 *bg_blkno)
1194{
1195 int status;
1196 u16 victim, i;
1197 struct ocfs2_chain_list *cl;
1198 struct ocfs2_dinode *fe;
1199
1200 mlog_entry_void();
1201
1202 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1203 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1204 BUG_ON(!ac->ac_bh);
1205
1206 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1207 if (!OCFS2_IS_VALID_DINODE(fe)) {
1208 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1209 status = -EIO;
1210 goto bail;
1211 }
1212 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1213 le32_to_cpu(fe->id1.bitmap1.i_total)) {
Mark Fashehb06970532006-03-03 10:24:33 -08001214 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1215 "bits but only %u total.",
1216 (unsigned long long)le64_to_cpu(fe->i_blkno),
Mark Fashehccd979b2005-12-15 14:31:24 -08001217 le32_to_cpu(fe->id1.bitmap1.i_used),
1218 le32_to_cpu(fe->id1.bitmap1.i_total));
1219 status = -EIO;
1220 goto bail;
1221 }
1222
1223 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1224
1225 victim = ocfs2_find_victim_chain(cl);
1226 ac->ac_chain = victim;
1227 ac->ac_allow_chain_relink = 1;
1228
1229 status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1230 num_bits, bg_blkno);
1231 if (!status)
1232 goto bail;
1233 if (status < 0 && status != -ENOSPC) {
1234 mlog_errno(status);
1235 goto bail;
1236 }
1237
1238 mlog(0, "Search of victim chain %u came up with nothing, "
1239 "trying all chains now.\n", victim);
1240
1241 /* If we didn't pick a good victim, then just default to
1242 * searching each chain in order. Don't allow chain relinking
1243 * because we only calculate enough journal credits for one
1244 * relink per alloc. */
1245 ac->ac_allow_chain_relink = 0;
1246 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1247 if (i == victim)
1248 continue;
1249 if (!cl->cl_recs[i].c_free)
1250 continue;
1251
1252 ac->ac_chain = i;
1253 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1254 bit_off, num_bits,
1255 bg_blkno);
1256 if (!status)
1257 break;
1258 if (status < 0 && status != -ENOSPC) {
1259 mlog_errno(status);
1260 goto bail;
1261 }
1262 }
1263bail:
1264
1265 mlog_exit(status);
1266 return status;
1267}
1268
1269int ocfs2_claim_metadata(struct ocfs2_super *osb,
1270 struct ocfs2_journal_handle *handle,
1271 struct ocfs2_alloc_context *ac,
1272 u32 bits_wanted,
1273 u16 *suballoc_bit_start,
1274 unsigned int *num_bits,
1275 u64 *blkno_start)
1276{
1277 int status;
1278 u64 bg_blkno;
1279
1280 BUG_ON(!ac);
1281 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1282 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1283 BUG_ON(ac->ac_handle != handle);
1284
1285 status = ocfs2_claim_suballoc_bits(osb,
1286 ac,
1287 bits_wanted,
1288 1,
1289 suballoc_bit_start,
1290 num_bits,
1291 &bg_blkno);
1292 if (status < 0) {
1293 mlog_errno(status);
1294 goto bail;
1295 }
1296 atomic_inc(&osb->alloc_stats.bg_allocs);
1297
1298 *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1299 ac->ac_bits_given += (*num_bits);
1300 status = 0;
1301bail:
1302 mlog_exit(status);
1303 return status;
1304}
1305
1306int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1307 struct ocfs2_journal_handle *handle,
1308 struct ocfs2_alloc_context *ac,
1309 u16 *suballoc_bit,
1310 u64 *fe_blkno)
1311{
1312 int status;
1313 unsigned int num_bits;
1314 u64 bg_blkno;
1315
1316 mlog_entry_void();
1317
1318 BUG_ON(!ac);
1319 BUG_ON(ac->ac_bits_given != 0);
1320 BUG_ON(ac->ac_bits_wanted != 1);
1321 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1322 BUG_ON(ac->ac_handle != handle);
1323
1324 status = ocfs2_claim_suballoc_bits(osb,
1325 ac,
1326 1,
1327 1,
1328 suballoc_bit,
1329 &num_bits,
1330 &bg_blkno);
1331 if (status < 0) {
1332 mlog_errno(status);
1333 goto bail;
1334 }
1335 atomic_inc(&osb->alloc_stats.bg_allocs);
1336
1337 BUG_ON(num_bits != 1);
1338
1339 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1340 ac->ac_bits_given++;
1341 status = 0;
1342bail:
1343 mlog_exit(status);
1344 return status;
1345}
1346
1347/* translate a group desc. blkno and it's bitmap offset into
1348 * disk cluster offset. */
1349static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1350 u64 bg_blkno,
1351 u16 bg_bit_off)
1352{
1353 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1354 u32 cluster = 0;
1355
1356 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1357
1358 if (bg_blkno != osb->first_cluster_group_blkno)
1359 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1360 cluster += (u32) bg_bit_off;
1361 return cluster;
1362}
1363
1364/* given a cluster offset, calculate which block group it belongs to
1365 * and return that block offset. */
1366static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1367 u32 cluster)
1368{
1369 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1370 u32 group_no;
1371
1372 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1373
1374 group_no = cluster / osb->bitmap_cpg;
1375 if (!group_no)
1376 return osb->first_cluster_group_blkno;
1377 return ocfs2_clusters_to_blocks(inode->i_sb,
1378 group_no * osb->bitmap_cpg);
1379}
1380
1381/* given the block number of a cluster start, calculate which cluster
1382 * group and descriptor bitmap offset that corresponds to. */
1383static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1384 u64 data_blkno,
1385 u64 *bg_blkno,
1386 u16 *bg_bit_off)
1387{
1388 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1389 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1390
1391 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1392
1393 *bg_blkno = ocfs2_which_cluster_group(inode,
1394 data_cluster);
1395
1396 if (*bg_blkno == osb->first_cluster_group_blkno)
1397 *bg_bit_off = (u16) data_cluster;
1398 else
1399 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1400 data_blkno - *bg_blkno);
1401}
1402
1403/*
1404 * min_bits - minimum contiguous chunk from this total allocation we
1405 * can handle. set to what we asked for originally for a full
1406 * contig. allocation, set to '1' to indicate we can deal with extents
1407 * of any size.
1408 */
1409int ocfs2_claim_clusters(struct ocfs2_super *osb,
1410 struct ocfs2_journal_handle *handle,
1411 struct ocfs2_alloc_context *ac,
1412 u32 min_clusters,
1413 u32 *cluster_start,
1414 u32 *num_clusters)
1415{
1416 int status;
1417 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1418 u64 bg_blkno;
1419 u16 bg_bit_off;
1420
1421 mlog_entry_void();
1422
1423 BUG_ON(!ac);
1424 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1425
1426 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1427 && ac->ac_which != OCFS2_AC_USE_MAIN);
1428 BUG_ON(ac->ac_handle != handle);
1429
1430 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1431 status = ocfs2_claim_local_alloc_bits(osb,
1432 handle,
1433 ac,
1434 bits_wanted,
1435 cluster_start,
1436 num_clusters);
1437 if (!status)
1438 atomic_inc(&osb->alloc_stats.local_data);
1439 } else {
1440 if (min_clusters > (osb->bitmap_cpg - 1)) {
1441 /* The only paths asking for contiguousness
1442 * should know about this already. */
1443 mlog(ML_ERROR, "minimum allocation requested exceeds "
1444 "group bitmap size!");
1445 status = -ENOSPC;
1446 goto bail;
1447 }
1448 /* clamp the current request down to a realistic size. */
1449 if (bits_wanted > (osb->bitmap_cpg - 1))
1450 bits_wanted = osb->bitmap_cpg - 1;
1451
1452 status = ocfs2_claim_suballoc_bits(osb,
1453 ac,
1454 bits_wanted,
1455 min_clusters,
1456 &bg_bit_off,
1457 num_clusters,
1458 &bg_blkno);
1459 if (!status) {
1460 *cluster_start =
1461 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1462 bg_blkno,
1463 bg_bit_off);
1464 atomic_inc(&osb->alloc_stats.bitmap_data);
1465 }
1466 }
1467 if (status < 0) {
1468 if (status != -ENOSPC)
1469 mlog_errno(status);
1470 goto bail;
1471 }
1472
1473 ac->ac_bits_given += *num_clusters;
1474
1475bail:
1476 mlog_exit(status);
1477 return status;
1478}
1479
1480static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1481 struct inode *alloc_inode,
1482 struct ocfs2_group_desc *bg,
1483 struct buffer_head *group_bh,
1484 unsigned int bit_off,
1485 unsigned int num_bits)
1486{
1487 int status;
1488 unsigned int tmp;
1489 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1490 struct ocfs2_group_desc *undo_bg = NULL;
1491
1492 mlog_entry_void();
1493
1494 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1495 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1496 status = -EIO;
1497 goto bail;
1498 }
1499
1500 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1501
1502 if (ocfs2_is_cluster_bitmap(alloc_inode))
1503 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1504
1505 status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1506 journal_type);
1507 if (status < 0) {
1508 mlog_errno(status);
1509 goto bail;
1510 }
1511
1512 if (ocfs2_is_cluster_bitmap(alloc_inode))
1513 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1514
1515 tmp = num_bits;
1516 while(tmp--) {
1517 ocfs2_clear_bit((bit_off + tmp),
1518 (unsigned long *) bg->bg_bitmap);
1519 if (ocfs2_is_cluster_bitmap(alloc_inode))
1520 ocfs2_set_bit(bit_off + tmp,
1521 (unsigned long *) undo_bg->bg_bitmap);
1522 }
1523 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1524
1525 status = ocfs2_journal_dirty(handle, group_bh);
1526 if (status < 0)
1527 mlog_errno(status);
1528bail:
1529 return status;
1530}
1531
1532/*
1533 * expects the suballoc inode to already be locked.
1534 */
1535static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1536 struct inode *alloc_inode,
1537 struct buffer_head *alloc_bh,
1538 unsigned int start_bit,
1539 u64 bg_blkno,
1540 unsigned int count)
1541{
1542 int status = 0;
1543 u32 tmp_used;
1544 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1545 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1546 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1547 struct buffer_head *group_bh = NULL;
1548 struct ocfs2_group_desc *group;
1549
1550 mlog_entry_void();
1551
1552 if (!OCFS2_IS_VALID_DINODE(fe)) {
1553 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1554 status = -EIO;
1555 goto bail;
1556 }
1557 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1558
Mark Fashehb06970532006-03-03 10:24:33 -08001559 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1560 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1561 (unsigned long long)bg_blkno, start_bit);
Mark Fashehccd979b2005-12-15 14:31:24 -08001562
1563 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1564 alloc_inode);
1565 if (status < 0) {
1566 mlog_errno(status);
1567 goto bail;
1568 }
1569
1570 group = (struct ocfs2_group_desc *) group_bh->b_data;
Mark Fasheh7bf72ed2006-05-03 17:46:50 -07001571 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
1572 if (status) {
1573 mlog_errno(status);
Mark Fashehccd979b2005-12-15 14:31:24 -08001574 goto bail;
1575 }
1576 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1577
1578 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1579 group, group_bh,
1580 start_bit, count);
1581 if (status < 0) {
1582 mlog_errno(status);
1583 goto bail;
1584 }
1585
1586 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1587 OCFS2_JOURNAL_ACCESS_WRITE);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto bail;
1591 }
1592
1593 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1594 count);
1595 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1596 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1597
1598 status = ocfs2_journal_dirty(handle, alloc_bh);
1599 if (status < 0) {
1600 mlog_errno(status);
1601 goto bail;
1602 }
1603
1604bail:
1605 if (group_bh)
1606 brelse(group_bh);
1607
1608 mlog_exit(status);
1609 return status;
1610}
1611
1612static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1613{
1614 u64 group = block - (u64) bit;
1615
1616 return group;
1617}
1618
1619int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1620 struct inode *inode_alloc_inode,
1621 struct buffer_head *inode_alloc_bh,
1622 struct ocfs2_dinode *di)
1623{
1624 u64 blk = le64_to_cpu(di->i_blkno);
1625 u16 bit = le16_to_cpu(di->i_suballoc_bit);
1626 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1627
1628 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1629 inode_alloc_bh, bit, bg_blkno, 1);
1630}
1631
1632int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1633 struct inode *eb_alloc_inode,
1634 struct buffer_head *eb_alloc_bh,
1635 struct ocfs2_extent_block *eb)
1636{
1637 u64 blk = le64_to_cpu(eb->h_blkno);
1638 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1639 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1640
1641 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1642 bit, bg_blkno, 1);
1643}
1644
1645int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1646 struct inode *bitmap_inode,
1647 struct buffer_head *bitmap_bh,
1648 u64 start_blk,
1649 unsigned int num_clusters)
1650{
1651 int status;
1652 u16 bg_start_bit;
1653 u64 bg_blkno;
1654 struct ocfs2_dinode *fe;
1655
1656 /* You can't ever have a contiguous set of clusters
1657 * bigger than a block group bitmap so we never have to worry
1658 * about looping on them. */
1659
1660 mlog_entry_void();
1661
1662 /* This is expensive. We can safely remove once this stuff has
1663 * gotten tested really well. */
1664 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1665
1666 fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1667
1668 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1669 &bg_start_bit);
1670
Mark Fashehb06970532006-03-03 10:24:33 -08001671 mlog(0, "want to free %u clusters starting at block %llu\n",
1672 num_clusters, (unsigned long long)start_blk);
1673 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1674 (unsigned long long)bg_blkno, bg_start_bit);
Mark Fashehccd979b2005-12-15 14:31:24 -08001675
1676 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1677 bg_start_bit, bg_blkno,
1678 num_clusters);
1679 if (status < 0)
1680 mlog_errno(status);
1681
1682 mlog_exit(status);
1683 return status;
1684}
1685
1686static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1687{
1688 printk("Block Group:\n");
1689 printk("bg_signature: %s\n", bg->bg_signature);
1690 printk("bg_size: %u\n", bg->bg_size);
1691 printk("bg_bits: %u\n", bg->bg_bits);
1692 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1693 printk("bg_chain: %u\n", bg->bg_chain);
1694 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
Mark Fashehb06970532006-03-03 10:24:33 -08001695 printk("bg_next_group: %llu\n",
1696 (unsigned long long)bg->bg_next_group);
1697 printk("bg_parent_dinode: %llu\n",
1698 (unsigned long long)bg->bg_parent_dinode);
1699 printk("bg_blkno: %llu\n",
1700 (unsigned long long)bg->bg_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001701}
1702
1703static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1704{
1705 int i;
1706
Mark Fashehb06970532006-03-03 10:24:33 -08001707 printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001708 printk("i_signature: %s\n", fe->i_signature);
Mark Fashehb06970532006-03-03 10:24:33 -08001709 printk("i_size: %llu\n",
1710 (unsigned long long)fe->i_size);
Mark Fashehccd979b2005-12-15 14:31:24 -08001711 printk("i_clusters: %u\n", fe->i_clusters);
1712 printk("i_generation: %u\n",
1713 le32_to_cpu(fe->i_generation));
1714 printk("id1.bitmap1.i_used: %u\n",
1715 le32_to_cpu(fe->id1.bitmap1.i_used));
1716 printk("id1.bitmap1.i_total: %u\n",
1717 le32_to_cpu(fe->id1.bitmap1.i_total));
1718 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
1719 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
1720 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
1721 printk("id2.i_chain.cl_next_free_rec: %u\n",
1722 fe->id2.i_chain.cl_next_free_rec);
1723 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1724 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
1725 fe->id2.i_chain.cl_recs[i].c_free);
1726 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1727 fe->id2.i_chain.cl_recs[i].c_total);
Mark Fashehb06970532006-03-03 10:24:33 -08001728 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
1729 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001730 }
1731}