/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * refcounttree.c
 *
 * Copyright (C) 2009 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */

#define MLOG_MASK_PREFIX ML_REFCOUNT
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "inode.h"
#include "alloc.h"
#include "suballoc.h"
#include "journal.h"
#include "uptodate.h"
#include "super.h"
#include "buffer_head_io.h"
#include "blockcheck.h"
#include "refcounttree.h"
#include "dlmglue.h"

static inline struct ocfs2_refcount_tree *
cache_info_to_refcount(struct ocfs2_caching_info *ci)
{
	return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
}

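/*
 * Sanity-check a refcount block read from disk: verify the metadata
 * ecc, the block signature, the rf_blkno stored in the block and the
 * filesystem generation before the block is trusted.
 */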
static int ocfs2_validate_refcount_block(struct super_block *sb,
					 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)bh->b_data;

	mlog(0, "Validating refcount block %llu\n",
	     (unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
	if (rc) {
		mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
		     (unsigned long long)bh->b_blocknr);
		return rc;
	}

	if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
		ocfs2_error(sb,
			    "Refcount block #%llu has bad signature %.*s",
			    (unsigned long long)bh->b_blocknr, 7,
			    rb->rf_signature);
		return -EINVAL;
	}

	if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
		ocfs2_error(sb,
			    "Refcount block #%llu has an invalid rf_blkno "
			    "of %llu",
			    (unsigned long long)bh->b_blocknr,
			    (unsigned long long)le64_to_cpu(rb->rf_blkno));
		return -EINVAL;
	}

	if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
		ocfs2_error(sb,
			    "Refcount block #%llu has an invalid "
			    "rf_fs_generation of #%u",
			    (unsigned long long)bh->b_blocknr,
			    le32_to_cpu(rb->rf_fs_generation));
		return -EINVAL;
	}

	return 0;
}

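/*
 * Read and validate a refcount block through the metadata cache of @ci.
 * If *bh is NULL on entry, the buffer_head that was read is returned
 * through it; otherwise the caller's buffer_head is filled in.
 */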
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
				     u64 rb_blkno,
				     struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(ci, rb_blkno, &tmp,
			      ocfs2_validate_refcount_block);

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}

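/*
 * The caching operations below let a refcount tree carry its own
 * metadata cache (rf_ci), protected by rf_lock for the cache state and
 * rf_io_mutex for buffer I/O, independent of any inode.
 */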
static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	return rf->rf_blkno;
}

static struct super_block *
ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	return rf->rf_sb;
}

static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	spin_lock(&rf->rf_lock);
}

static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	spin_unlock(&rf->rf_lock);
}

static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	mutex_lock(&rf->rf_io_mutex);
}

static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	mutex_unlock(&rf->rf_io_mutex);
}

static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
	.co_owner		= ocfs2_refcount_cache_owner,
	.co_get_super		= ocfs2_refcount_cache_get_super,
	.co_cache_lock		= ocfs2_refcount_cache_lock,
	.co_cache_unlock	= ocfs2_refcount_cache_unlock,
	.co_io_lock		= ocfs2_refcount_cache_io_lock,
	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
};

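/*
 * Look up the in-memory refcount tree for @blkno in the
 * osb_rf_lock_tree rb-tree.  The caller must hold osb_lock.
 */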
static struct ocfs2_refcount_tree *
ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
{
	struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
	struct ocfs2_refcount_tree *tree = NULL;

	while (n) {
		tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);

		if (blkno < tree->rf_blkno)
			n = n->rb_left;
		else if (blkno > tree->rf_blkno)
			n = n->rb_right;
		else
			return tree;
	}

	return NULL;
}

/* osb_lock is already locked. */
static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
				       struct ocfs2_refcount_tree *new)
{
	u64 rf_blkno = new->rf_blkno;
	struct rb_node *parent = NULL;
	struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
	struct ocfs2_refcount_tree *tmp;

	while (*p) {
		parent = *p;

		tmp = rb_entry(parent, struct ocfs2_refcount_tree,
			       rf_node);

		if (rf_blkno < tmp->rf_blkno)
			p = &(*p)->rb_left;
		else if (rf_blkno > tmp->rf_blkno)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
			     (unsigned long long)rf_blkno);
			BUG();
		}
	}

	rb_link_node(&new->rf_node, parent, p);
	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
}

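/*
 * Tear down an in-memory refcount tree: release its metadata cache,
 * drop and free its cluster lock resource, then free the structure.
 */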
static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
{
	ocfs2_metadata_cache_exit(&tree->rf_ci);
	ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
	ocfs2_lock_res_free(&tree->rf_lockres);
	kfree(tree);
}

static inline void
ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *tree)
{
	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
	if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
		osb->osb_ref_tree_lru = NULL;
}

static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *tree)
{
	spin_lock(&osb->osb_lock);
	ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
	spin_unlock(&osb->osb_lock);
}

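/*
 * kref release callback: the last ocfs2_refcount_tree_put() on a tree
 * frees it via ocfs2_free_refcount_tree().
 */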
void ocfs2_kref_remove_refcount_tree(struct kref *kref)
{
	struct ocfs2_refcount_tree *tree =
		container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);

	ocfs2_free_refcount_tree(tree);
}

static inline void
ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
{
	kref_get(&tree->rf_getcnt);
}

static inline void
ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
{
	kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
}

static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
					       struct super_block *sb)
{
	ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
	mutex_init(&new->rf_io_mutex);
	new->rf_sb = sb;
	spin_lock_init(&new->rf_lock);
}

static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *new,
					u64 rf_blkno, u32 generation)
{
	init_rwsem(&new->rf_sem);
	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
				     rf_blkno, generation);
}

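/*
 * Find the in-memory refcount tree for @rf_blkno, or allocate,
 * initialize and insert a new one if none is cached yet.  The most
 * recently used tree is remembered in osb->osb_ref_tree_lru as a
 * one-entry fast path.
 */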
static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
				   struct ocfs2_refcount_tree **ret_tree)
{
	int ret = 0;
	struct ocfs2_refcount_tree *tree, *new = NULL;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_block *ref_rb;

	spin_lock(&osb->osb_lock);
	if (osb->osb_ref_tree_lru &&
	    osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
		tree = osb->osb_ref_tree_lru;
	else
		tree = ocfs2_find_refcount_tree(osb, rf_blkno);
	if (tree)
		goto out;

	spin_unlock(&osb->osb_lock);

	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
	if (!new) {
		ret = -ENOMEM;
		return ret;
	}

	new->rf_blkno = rf_blkno;
	kref_init(&new->rf_getcnt);
	ocfs2_init_refcount_tree_ci(new, osb->sb);

	/*
	 * We need the generation to create the refcount tree lock, and
	 * since it isn't changed during tree modification, we are safe
	 * to read it here without protection.
	 * We also have to purge the cache after we create the lock, since
	 * the refcount block may contain stale data.  It can only be
	 * trusted once we hold the refcount lock.
	 */
	ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		ocfs2_metadata_cache_exit(&new->rf_ci);
		kfree(new);
		return ret;
	}

	ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
	ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
				      new->rf_generation);
	ocfs2_metadata_cache_purge(&new->rf_ci);

	spin_lock(&osb->osb_lock);
	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
	if (tree)
		goto out;

	ocfs2_insert_refcount_tree(osb, new);

	tree = new;
	new = NULL;

out:
	*ret_tree = tree;

	osb->osb_ref_tree_lru = tree;

	spin_unlock(&osb->osb_lock);

	if (new)
		ocfs2_free_refcount_tree(new);

	brelse(ref_root_bh);
	return ret;
}

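/*
 * Read the inode block and return the refcount tree root block number
 * stored in i_refcount_loc.  The inode must already have the
 * OCFS2_HAS_REFCOUNT_FL feature flag set.
 */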
static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
{
	int ret;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di;

	ret = ocfs2_read_inode_block(inode, &di_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));

	di = (struct ocfs2_dinode *)di_bh->b_data;
	*ref_blkno = le64_to_cpu(di->i_refcount_loc);
	brelse(di_bh);
out:
	return ret;
}

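/*
 * Take the cluster lock on the refcount tree, then the local rf_sem
 * (write for rw != 0, read otherwise).
 */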
static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
				      struct ocfs2_refcount_tree *tree, int rw)
{
	int ret;

	ret = ocfs2_refcount_lock(tree, rw);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (rw)
		down_write(&tree->rf_sem);
	else
		down_read(&tree->rf_sem);

out:
	return ret;
}

/*
 * Lock the refcount tree pointed to by ref_blkno and return the tree.
 * In most cases, we lock the tree and read the refcount block.
 * So read it here if the caller really needs it.
 *
 * If the tree has been re-created by another node, free the
 * old one and re-create it.
 */
int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
			     u64 ref_blkno, int rw,
			     struct ocfs2_refcount_tree **ret_tree,
			     struct buffer_head **ref_bh)
{
	int ret, delete_tree = 0;
	struct ocfs2_refcount_tree *tree = NULL;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_block *rb;

again:
	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	ocfs2_refcount_tree_get(tree);

	ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
	if (ret) {
		mlog_errno(ret);
		ocfs2_refcount_tree_put(tree);
		goto out;
	}

	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
					&ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		ocfs2_unlock_refcount_tree(osb, tree, rw);
		ocfs2_refcount_tree_put(tree);
		goto out;
	}

	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	/*
	 * If the refcount block has been freed and re-created, we may need
	 * to recreate the in-memory refcount tree as well.
	 *
	 * Here we just remove the tree from the rb-tree, and the last
	 * kref holder will unlock and delete this refcount_tree.
	 * Then we goto "again" and ocfs2_get_refcount_tree will create
	 * the new refcount tree for us.
	 */
	if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
		if (!tree->rf_removed) {
			ocfs2_erase_refcount_tree_from_list(osb, tree);
			tree->rf_removed = 1;
			delete_tree = 1;
		}

		ocfs2_unlock_refcount_tree(osb, tree, rw);
		/*
		 * We get an extra reference when we create the refcount
		 * tree, so another put will destroy it.
		 */
		if (delete_tree)
			ocfs2_refcount_tree_put(tree);
		brelse(ref_root_bh);
		ref_root_bh = NULL;
		goto again;
	}

	*ret_tree = tree;
	if (ref_bh) {
		*ref_bh = ref_root_bh;
		ref_root_bh = NULL;
	}
out:
	brelse(ref_root_bh);
	return ret;
}

int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
				      struct ocfs2_refcount_tree **ret_tree,
				      struct buffer_head **ref_bh)
{
	int ret;
	u64 ref_blkno;

	ret = ocfs2_get_refcount_block(inode, &ref_blkno);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
					rw, ret_tree, ref_bh);
}

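/*
 * Undo ocfs2_lock_refcount_tree(): release rf_sem, drop the cluster
 * lock and put the reference taken when the tree was locked.
 */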
void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
				struct ocfs2_refcount_tree *tree, int rw)
{
	if (rw)
		up_write(&tree->rf_sem);
	else
		up_read(&tree->rf_sem);

	ocfs2_refcount_unlock(tree, rw);
	ocfs2_refcount_tree_put(tree);
}

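/*
 * Walk osb_rf_lock_tree and free every cached refcount tree.  No
 * locking is taken here, so the caller must ensure no tree is still
 * in use.
 */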
void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
{
	struct rb_node *node;
	struct ocfs2_refcount_tree *tree;
	struct rb_root *root = &osb->osb_rf_lock_tree;

	while ((node = rb_last(root)) != NULL) {
		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);

		mlog(0, "Purge tree %llu\n",
		     (unsigned long long) tree->rf_blkno);

		rb_erase(&tree->rf_node, root);
		ocfs2_free_refcount_tree(tree);
	}
}