1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.c
5 *
6 * Code which implements an OCFS2 specific interface to our DLM.
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/mm.h>
30#include <linux/smp_lock.h>
31#include <linux/crc32.h>
32#include <linux/kthread.h>
33#include <linux/pagemap.h>
34#include <linux/debugfs.h>
35#include <linux/seq_file.h>
36
37#include <cluster/heartbeat.h>
38#include <cluster/nodemanager.h>
39#include <cluster/tcp.h>
40
41#include <dlm/dlmapi.h>
42
43#define MLOG_MASK_PREFIX ML_DLM_GLUE
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "dcache.h"
50#include "dlmglue.h"
51#include "extent_map.h"
52#include "heartbeat.h"
53#include "inode.h"
54#include "journal.h"
55#include "slot_map.h"
56#include "super.h"
57#include "uptodate.h"
58#include "vote.h"
59
60#include "buffer_head_io.h"
61
62struct ocfs2_mask_waiter {
63 struct list_head mw_item;
64 int mw_status;
65 struct completion mw_complete;
66 unsigned long mw_mask;
67 unsigned long mw_goal;
68};
69
70static void ocfs2_inode_ast_func(void *opaque);
71static void ocfs2_inode_bast_func(void *opaque,
72 int level);
73static void ocfs2_dentry_ast_func(void *opaque);
74static void ocfs2_dentry_bast_func(void *opaque,
75 int level);
76static void ocfs2_super_ast_func(void *opaque);
77static void ocfs2_super_bast_func(void *opaque,
78 int level);
79static void ocfs2_rename_ast_func(void *opaque);
80static void ocfs2_rename_bast_func(void *opaque,
81 int level);
82
83/*
84 * Return value from ocfs2_convert_worker_t functions.
85 *
86 * These control the precise actions of ocfs2_generic_unblock_lock()
87 * and ocfs2_process_blocked_lock()
88 *
89 */
90enum ocfs2_unblock_action {
91 UNBLOCK_CONTINUE = 0, /* Continue downconvert */
92 UNBLOCK_CONTINUE_POST = 1, /* Continue downconvert, fire
93 * ->post_unlock callback */
94 UNBLOCK_STOP_POST = 2, /* Do not downconvert, fire
95 * ->post_unlock() callback. */
96};
97
98struct ocfs2_unblock_ctl {
99 int requeue;
100 enum ocfs2_unblock_action unblock_action;
101};
102
103/* so far, all locks have gotten along with the same unlock ast */
104static void ocfs2_unlock_ast_func(void *opaque,
105 enum dlm_status status);
106static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
107 struct ocfs2_unblock_ctl *ctl);
108static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
109 struct ocfs2_unblock_ctl *ctl);
110static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
111 struct ocfs2_unblock_ctl *ctl);
112static int ocfs2_unblock_dentry_lock(struct ocfs2_lock_res *lockres,
113 struct ocfs2_unblock_ctl *ctl);
114static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
115 struct ocfs2_unblock_ctl *ctl);
116
117static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
118 struct ocfs2_lock_res *lockres);
119
120/*
121 * OCFS2 Lock Resource Operations
122 *
123 * These fine tune the behavior of the generic dlmglue locking infrastructure.
124 */
125struct ocfs2_lock_res_ops {
126 void (*ast)(void *);
127 void (*bast)(void *, int);
128 void (*unlock_ast)(void *, enum dlm_status);
129 int (*unblock)(struct ocfs2_lock_res *, struct ocfs2_unblock_ctl *);
130 void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *);
131
132 /*
133 * LOCK_TYPE_* flags which describe the specific requirements
134 * of a lock type. Descriptions of each individual flag follow.
135 */
136 int flags;
137};
138
139/*
140 * Some locks want to "refresh" potentially stale data when a
141 * meaningful (PRMODE or EXMODE) lock level is first obtained. If this
142 * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the
143 * individual lockres l_flags member from the ast function. It is
144 * expected that the locking wrapper will clear the
145 * OCFS2_LOCK_NEEDS_REFRESH flag when done.
146 */
147#define LOCK_TYPE_REQUIRES_REFRESH 0x1
148
149typedef int (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
150static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
151 struct ocfs2_lock_res *lockres,
152 struct ocfs2_unblock_ctl *ctl,
153 ocfs2_convert_worker_t *worker);
154
155static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
156 .ast = ocfs2_inode_ast_func,
157 .bast = ocfs2_inode_bast_func,
158 .unlock_ast = ocfs2_unlock_ast_func,
159 .unblock = ocfs2_unblock_inode_lock,
160 .flags = 0,
161};
162
163static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
164 .ast = ocfs2_inode_ast_func,
165 .bast = ocfs2_inode_bast_func,
166 .unlock_ast = ocfs2_unlock_ast_func,
167 .unblock = ocfs2_unblock_meta,
168 .flags = LOCK_TYPE_REQUIRES_REFRESH,
169};
170
171static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
172 .ast = ocfs2_inode_ast_func,
173 .bast = ocfs2_inode_bast_func,
174 .unlock_ast = ocfs2_unlock_ast_func,
175 .unblock = ocfs2_unblock_data,
176 .flags = 0,
177};
178
179static struct ocfs2_lock_res_ops ocfs2_super_lops = {
180 .ast = ocfs2_super_ast_func,
181 .bast = ocfs2_super_bast_func,
182 .unlock_ast = ocfs2_unlock_ast_func,
183 .unblock = ocfs2_unblock_osb_lock,
184 .flags = LOCK_TYPE_REQUIRES_REFRESH,
185};
186
187static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
188 .ast = ocfs2_rename_ast_func,
189 .bast = ocfs2_rename_bast_func,
190 .unlock_ast = ocfs2_unlock_ast_func,
191 .unblock = ocfs2_unblock_osb_lock,
192 .flags = 0,
193};
194
195static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
196 .ast = ocfs2_dentry_ast_func,
197 .bast = ocfs2_dentry_bast_func,
198 .unlock_ast = ocfs2_unlock_ast_func,
199 .unblock = ocfs2_unblock_dentry_lock,
200 .post_unlock = ocfs2_dentry_post_unlock,
201 .flags = 0,
202};
203
204static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
205{
206 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
207 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
208 lockres->l_type == OCFS2_LOCK_TYPE_RW;
209}
210
211static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
212{
213 return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
214}
215
216static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
217{
218 return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
219}
220
221static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
222{
223 BUG_ON(!ocfs2_is_super_lock(lockres)
224 && !ocfs2_is_rename_lock(lockres));
225
226 return (struct ocfs2_super *) lockres->l_priv;
227}
228
229static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
230{
231 BUG_ON(!ocfs2_is_inode_lock(lockres));
232
233 return (struct inode *) lockres->l_priv;
234}
235
236static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres)
237{
238 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY);
239
240 return (struct ocfs2_dentry_lock *)lockres->l_priv;
241}
242
243static int ocfs2_lock_create(struct ocfs2_super *osb,
244 struct ocfs2_lock_res *lockres,
245 int level,
246 int dlm_flags);
247static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
248 int wanted);
249static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
250 struct ocfs2_lock_res *lockres,
251 int level);
252static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
253static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
254static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
255static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
256static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
257 struct ocfs2_lock_res *lockres);
258static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
259 int convert);
260#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \
261 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
262 "resource %s: %s\n", dlm_errname(_stat), _func, \
263 _lockres->l_name, dlm_errmsg(_stat)); \
264} while (0)
265static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
266 struct ocfs2_lock_res *lockres);
267static int ocfs2_meta_lock_update(struct inode *inode,
268 struct buffer_head **bh);
269static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
270static inline int ocfs2_highest_compat_lock_level(int level);
271static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
272 struct ocfs2_lock_res *lockres,
273 int new_level);
274
275static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
276 u64 blkno,
277 u32 generation,
278 char *name)
279{
280 int len;
281
282 mlog_entry_void();
283
284 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
285
286 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
287 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
288 (long long)blkno, generation);
289
290 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
291
292 mlog(0, "built lock resource with name: %s\n", name);
293
294 mlog_exit_void();
295}
296
297static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
298
299static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
300 struct ocfs2_dlm_debug *dlm_debug)
301{
302 mlog(0, "Add tracking for lockres %s\n", res->l_name);
303
304 spin_lock(&ocfs2_dlm_tracking_lock);
305 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
306 spin_unlock(&ocfs2_dlm_tracking_lock);
307}
308
309static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
310{
311 spin_lock(&ocfs2_dlm_tracking_lock);
312 if (!list_empty(&res->l_debug_list))
313 list_del_init(&res->l_debug_list);
314 spin_unlock(&ocfs2_dlm_tracking_lock);
315}
316
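/*
 * Common lockres setup: record the type, callback ops and private data,
 * start every lock level off at LKM_IVMODE, mark the resource as
 * initialized and add it to the dlm debug tracking list.
 */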
317static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
318 struct ocfs2_lock_res *res,
319 enum ocfs2_lock_type type,
320 struct ocfs2_lock_res_ops *ops,
321 void *priv)
322{
323 res->l_type = type;
324 res->l_ops = ops;
325 res->l_priv = priv;
326
327 res->l_level = LKM_IVMODE;
328 res->l_requested = LKM_IVMODE;
329 res->l_blocking = LKM_IVMODE;
330 res->l_action = OCFS2_AST_INVALID;
331 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
332
333 res->l_flags = OCFS2_LOCK_INITIALIZED;
334
335 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
336}
337
338void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
339{
340 /* This also clears out the lock status block */
341 memset(res, 0, sizeof(struct ocfs2_lock_res));
342 spin_lock_init(&res->l_lock);
343 init_waitqueue_head(&res->l_event);
344 INIT_LIST_HEAD(&res->l_blocked_list);
345 INIT_LIST_HEAD(&res->l_mask_waiters);
346}
347
348void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
349 enum ocfs2_lock_type type,
350 unsigned int generation,
351 struct inode *inode)
352{
353 struct ocfs2_lock_res_ops *ops;
354
355 switch(type) {
356 case OCFS2_LOCK_TYPE_RW:
357 ops = &ocfs2_inode_rw_lops;
358 break;
359 case OCFS2_LOCK_TYPE_META:
360 ops = &ocfs2_inode_meta_lops;
361 break;
362 case OCFS2_LOCK_TYPE_DATA:
363 ops = &ocfs2_inode_data_lops;
364 break;
365 default:
366 mlog_bug_on_msg(1, "type: %d\n", type);
367 ops = NULL; /* thanks, gcc */
368 break;
369 };
370
371 ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
372 generation, res->l_name);
373 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode);
374}
375
376static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
377{
378 __be64 inode_blkno_be;
379
380 memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START],
381 sizeof(__be64));
382
383 return be64_to_cpu(inode_blkno_be);
384}
385
386void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
387 u64 parent, struct inode *inode)
388{
389 int len;
390 u64 inode_blkno = OCFS2_I(inode)->ip_blkno;
391 __be64 inode_blkno_be = cpu_to_be64(inode_blkno);
392 struct ocfs2_lock_res *lockres = &dl->dl_lockres;
393
394 ocfs2_lock_res_init_once(lockres);
395
396 /*
397 * Unfortunately, the standard lock naming scheme won't work
398 * here because we have two 16 byte values to use. Instead,
399 * we'll stuff the inode number as a binary value. We still
400 * want error prints to show something without garbling the
401 * display, so drop a null byte in there before the inode
402 * number. A future version of OCFS2 will likely use all
403 * binary lock names. The stringified names have been a
404 * tremendous aid in debugging, but now that the debugfs
405 * interface exists, we can mangle things there if need be.
406 *
407 * NOTE: We also drop the standard "pad" value (the total lock
408 * name size stays the same though - the last part is all
409 * zeros due to the memset in ocfs2_lock_res_init_once()).
410 */
411 len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START,
412 "%c%016llx",
413 ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY),
414 (long long)parent);
415
416 BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1));
417
418 memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be,
419 sizeof(__be64));
420
421 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
422 OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops,
423 dl);
424}
425
426static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
427 struct ocfs2_super *osb)
428{
429 /* Superblock lockres doesn't come from a slab so we call init
430 * once on it manually. */
431 ocfs2_lock_res_init_once(res);
432 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO,
433 0, res->l_name);
434 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
435 &ocfs2_super_lops, osb);
436}
437
438static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
439 struct ocfs2_super *osb)
440{
441 /* Rename lockres doesn't come from a slab so we call init
442 * once on it manually. */
443 ocfs2_lock_res_init_once(res);
444 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name);
445 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME,
446 &ocfs2_rename_lops, osb);
447}
448
449void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
450{
451 mlog_entry_void();
452
453 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
454 return;
455
456 ocfs2_remove_lockres_tracking(res);
457
458 mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
459 "Lockres %s is on the blocked list\n",
460 res->l_name);
461 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
462 "Lockres %s has mask waiters pending\n",
463 res->l_name);
464 mlog_bug_on_msg(spin_is_locked(&res->l_lock),
465 "Lockres %s is locked\n",
466 res->l_name);
467 mlog_bug_on_msg(res->l_ro_holders,
468 "Lockres %s has %u ro holders\n",
469 res->l_name, res->l_ro_holders);
470 mlog_bug_on_msg(res->l_ex_holders,
471 "Lockres %s has %u ex holders\n",
472 res->l_name, res->l_ex_holders);
473
474 /* Need to clear out the lock status block for the dlm */
475 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
476
477 res->l_flags = 0UL;
478 mlog_exit_void();
479}
480
481static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
482 int level)
483{
484 mlog_entry_void();
485
486 BUG_ON(!lockres);
487
488 switch(level) {
489 case LKM_EXMODE:
490 lockres->l_ex_holders++;
491 break;
492 case LKM_PRMODE:
493 lockres->l_ro_holders++;
494 break;
495 default:
496 BUG();
497 }
498
499 mlog_exit_void();
500}
501
502static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
503 int level)
504{
505 mlog_entry_void();
506
507 BUG_ON(!lockres);
508
509 switch(level) {
510 case LKM_EXMODE:
511 BUG_ON(!lockres->l_ex_holders);
512 lockres->l_ex_holders--;
513 break;
514 case LKM_PRMODE:
515 BUG_ON(!lockres->l_ro_holders);
516 lockres->l_ro_holders--;
517 break;
518 default:
519 BUG();
520 }
521 mlog_exit_void();
522}
523
524/* WARNING: This function lives in a world where the only three lock
525 * levels are EX, PR, and NL. It *will* have to be adjusted when more
526 * lock types are added. */
527static inline int ocfs2_highest_compat_lock_level(int level)
528{
529 int new_level = LKM_EXMODE;
530
531 if (level == LKM_EXMODE)
532 new_level = LKM_NLMODE;
533 else if (level == LKM_PRMODE)
534 new_level = LKM_PRMODE;
535 return new_level;
536}
537
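/*
 * Set the lockres flag word and complete any mask waiters whose
 * (mask, goal) condition is now satisfied. Caller must hold l_lock.
 */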
538static void lockres_set_flags(struct ocfs2_lock_res *lockres,
539 unsigned long newflags)
540{
541 struct list_head *pos, *tmp;
542 struct ocfs2_mask_waiter *mw;
543
544 assert_spin_locked(&lockres->l_lock);
545
546 lockres->l_flags = newflags;
547
548 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
549 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
550 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
551 continue;
552
553 list_del_init(&mw->mw_item);
554 mw->mw_status = 0;
555 complete(&mw->mw_complete);
556 }
557}
558static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
559{
560 lockres_set_flags(lockres, lockres->l_flags | or);
561}
562static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
563 unsigned long clear)
564{
565 lockres_set_flags(lockres, lockres->l_flags & ~clear);
566}
567
568static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
569{
570 mlog_entry_void();
571
572 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
573 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
574 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
575 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
576
577 lockres->l_level = lockres->l_requested;
578 if (lockres->l_level <=
579 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
580 lockres->l_blocking = LKM_NLMODE;
581 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
582 }
583 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
584
585 mlog_exit_void();
586}
587
588static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
589{
590 mlog_entry_void();
591
592 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
593 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
594
595 /* Convert from RO to EX doesn't really need anything as our
596 * information is already up to date. Convert from NL to
597 * *anything* however should mark ourselves as needing an
598 * update */
599 if (lockres->l_level == LKM_NLMODE &&
600 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
601 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
602
603 lockres->l_level = lockres->l_requested;
604 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
605
606 mlog_exit_void();
607}
608
609static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
610{
611 mlog_entry_void();
612
613 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
614 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
615
616 if (lockres->l_requested > LKM_NLMODE &&
617 !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
618 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
619 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
620
621 lockres->l_level = lockres->l_requested;
622 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
623 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
624
625 mlog_exit_void();
626}
627
628static void ocfs2_inode_ast_func(void *opaque)
629{
630 struct ocfs2_lock_res *lockres = opaque;
631 struct inode *inode;
632 struct dlm_lockstatus *lksb;
633 unsigned long flags;
634
635 mlog_entry_void();
636
637 inode = ocfs2_lock_res_inode(lockres);
638
639 mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n",
640 (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action,
641 ocfs2_lock_type_string(lockres->l_type));
642
643 BUG_ON(!ocfs2_is_inode_lock(lockres));
644
645 spin_lock_irqsave(&lockres->l_lock, flags);
646
647 lksb = &(lockres->l_lksb);
648 if (lksb->status != DLM_NORMAL) {
649 mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
Mark Fashehb0697052006-03-03 10:24:33 -0800650 "on inode %llu\n", lksb->status,
651 (unsigned long long)OCFS2_I(inode)->ip_blkno);
652 spin_unlock_irqrestore(&lockres->l_lock, flags);
653 mlog_exit_void();
654 return;
655 }
656
657 switch(lockres->l_action) {
658 case OCFS2_AST_ATTACH:
659 ocfs2_generic_handle_attach_action(lockres);
660 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
661 break;
662 case OCFS2_AST_CONVERT:
663 ocfs2_generic_handle_convert_action(lockres);
664 break;
665 case OCFS2_AST_DOWNCONVERT:
666 ocfs2_generic_handle_downconvert_action(lockres);
667 break;
668 default:
669 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
670 "lockres flags = 0x%lx, unlock action: %u\n",
671 lockres->l_name, lockres->l_action, lockres->l_flags,
672 lockres->l_unlock_action);
673
674 BUG();
675 }
676
677 /* set it to something invalid so if we get called again we
678 * can catch it. */
679 lockres->l_action = OCFS2_AST_INVALID;
680 spin_unlock_irqrestore(&lockres->l_lock, flags);
681 wake_up(&lockres->l_event);
682
683 mlog_exit_void();
684}
685
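/*
 * Handle a blocking AST under l_lock: mark the lockres BLOCKED, remember
 * the most restrictive level we have been asked to make room for, and
 * return nonzero if a downconvert still needs to be scheduled.
 */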
686static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
687 int level)
688{
689 int needs_downconvert = 0;
690 mlog_entry_void();
691
692 assert_spin_locked(&lockres->l_lock);
693
694 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
695
696 if (level > lockres->l_blocking) {
697 /* only schedule a downconvert if we haven't already scheduled
698 * one that goes low enough to satisfy the level we're
699 * blocking. this also catches the case where we get
700 * duplicate BASTs */
701 if (ocfs2_highest_compat_lock_level(level) <
702 ocfs2_highest_compat_lock_level(lockres->l_blocking))
703 needs_downconvert = 1;
704
705 lockres->l_blocking = level;
706 }
707
708 mlog_exit(needs_downconvert);
709 return needs_downconvert;
710}
711
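/*
 * Common BAST path: record the blocking request, queue the lockres for
 * downconvert if needed and kick the vote thread to process it.
 */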
712static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
713 struct ocfs2_lock_res *lockres,
714 int level)
715{
716 int needs_downconvert;
717 unsigned long flags;
718
719 mlog_entry_void();
720
721 BUG_ON(level <= LKM_NLMODE);
722
723 spin_lock_irqsave(&lockres->l_lock, flags);
724 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
725 if (needs_downconvert)
726 ocfs2_schedule_blocked_lock(osb, lockres);
727 spin_unlock_irqrestore(&lockres->l_lock, flags);
728
729 wake_up(&lockres->l_event);
730
731 ocfs2_kick_vote_thread(osb);
732
733 mlog_exit_void();
734}
735
736static void ocfs2_inode_bast_func(void *opaque, int level)
737{
738 struct ocfs2_lock_res *lockres = opaque;
739 struct inode *inode;
740 struct ocfs2_super *osb;
741
742 mlog_entry_void();
743
744 BUG_ON(!ocfs2_is_inode_lock(lockres));
745
746 inode = ocfs2_lock_res_inode(lockres);
747 osb = OCFS2_SB(inode->i_sb);
748
749 mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n",
750 (unsigned long long)OCFS2_I(inode)->ip_blkno, level,
751 lockres->l_level, ocfs2_lock_type_string(lockres->l_type));
752
753 ocfs2_generic_bast_func(osb, lockres, level);
754
755 mlog_exit_void();
756}
757
758static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres)
759{
760 struct dlm_lockstatus *lksb = &lockres->l_lksb;
761 unsigned long flags;
762
763 spin_lock_irqsave(&lockres->l_lock, flags);
764
765 if (lksb->status != DLM_NORMAL) {
766 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
767 lockres->l_name, lksb->status);
768 spin_unlock_irqrestore(&lockres->l_lock, flags);
769 return;
770 }
771
772 switch(lockres->l_action) {
773 case OCFS2_AST_ATTACH:
774 ocfs2_generic_handle_attach_action(lockres);
775 break;
776 case OCFS2_AST_CONVERT:
777 ocfs2_generic_handle_convert_action(lockres);
778 break;
779 case OCFS2_AST_DOWNCONVERT:
780 ocfs2_generic_handle_downconvert_action(lockres);
781 break;
782 default:
783 BUG();
784 }
785
786 /* set it to something invalid so if we get called again we
787 * can catch it. */
788 lockres->l_action = OCFS2_AST_INVALID;
789
790 wake_up(&lockres->l_event);
791 spin_unlock_irqrestore(&lockres->l_lock, flags);
792}
793
794static void ocfs2_super_ast_func(void *opaque)
795{
796 struct ocfs2_lock_res *lockres = opaque;
797
798 mlog_entry_void();
799 mlog(0, "Superblock AST fired\n");
800
801 BUG_ON(!ocfs2_is_super_lock(lockres));
802 ocfs2_generic_ast_func(lockres);
803
804 mlog_exit_void();
805}
806
807static void ocfs2_super_bast_func(void *opaque,
808 int level)
809{
810 struct ocfs2_lock_res *lockres = opaque;
811 struct ocfs2_super *osb;
812
813 mlog_entry_void();
814 mlog(0, "Superblock BAST fired\n");
815
816 BUG_ON(!ocfs2_is_super_lock(lockres));
817 osb = ocfs2_lock_res_super(lockres);
818 ocfs2_generic_bast_func(osb, lockres, level);
819
820 mlog_exit_void();
821}
822
823static void ocfs2_rename_ast_func(void *opaque)
824{
825 struct ocfs2_lock_res *lockres = opaque;
826
827 mlog_entry_void();
828
829 mlog(0, "Rename AST fired\n");
830
831 BUG_ON(!ocfs2_is_rename_lock(lockres));
832
833 ocfs2_generic_ast_func(lockres);
834
835 mlog_exit_void();
836}
837
838static void ocfs2_rename_bast_func(void *opaque,
839 int level)
840{
841 struct ocfs2_lock_res *lockres = opaque;
842 struct ocfs2_super *osb;
843
844 mlog_entry_void();
845
846 mlog(0, "Rename BAST fired\n");
847
848 BUG_ON(!ocfs2_is_rename_lock(lockres));
849
850 osb = ocfs2_lock_res_super(lockres);
851 ocfs2_generic_bast_func(osb, lockres, level);
852
853 mlog_exit_void();
854}
855
856static void ocfs2_dentry_ast_func(void *opaque)
857{
858 struct ocfs2_lock_res *lockres = opaque;
859
860 BUG_ON(!lockres);
861
862 ocfs2_generic_ast_func(lockres);
863}
864
865static void ocfs2_dentry_bast_func(void *opaque, int level)
866{
867 struct ocfs2_lock_res *lockres = opaque;
868 struct ocfs2_dentry_lock *dl = lockres->l_priv;
869 struct ocfs2_super *osb = OCFS2_SB(dl->dl_inode->i_sb);
870
871 mlog(0, "Dentry bast: level: %d, name: %s\n", level,
872 lockres->l_name);
873
874 ocfs2_generic_bast_func(osb, lockres, level);
875}
876
877static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
878 int convert)
879{
880 unsigned long flags;
881
882 mlog_entry_void();
883 spin_lock_irqsave(&lockres->l_lock, flags);
884 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
885 if (convert)
886 lockres->l_action = OCFS2_AST_INVALID;
887 else
888 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
889 spin_unlock_irqrestore(&lockres->l_lock, flags);
890
891 wake_up(&lockres->l_event);
892 mlog_exit_void();
893}
894
895/* Note: If we detect another process working on the lock (i.e.,
896 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
897 * to do the right thing in that case.
898 */
899static int ocfs2_lock_create(struct ocfs2_super *osb,
900 struct ocfs2_lock_res *lockres,
901 int level,
902 int dlm_flags)
903{
904 int ret = 0;
905 enum dlm_status status;
906 unsigned long flags;
907
908 mlog_entry_void();
909
910 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
911 dlm_flags);
912
913 spin_lock_irqsave(&lockres->l_lock, flags);
914 if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
915 (lockres->l_flags & OCFS2_LOCK_BUSY)) {
916 spin_unlock_irqrestore(&lockres->l_lock, flags);
917 goto bail;
918 }
919
920 lockres->l_action = OCFS2_AST_ATTACH;
921 lockres->l_requested = level;
922 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
923 spin_unlock_irqrestore(&lockres->l_lock, flags);
924
925 status = dlmlock(osb->dlm,
926 level,
927 &lockres->l_lksb,
928 dlm_flags,
929 lockres->l_name,
930 OCFS2_LOCK_ID_MAX_LEN - 1,
931 lockres->l_ops->ast,
932 lockres,
933 lockres->l_ops->bast);
934 if (status != DLM_NORMAL) {
935 ocfs2_log_dlm_error("dlmlock", status, lockres);
936 ret = -EINVAL;
937 ocfs2_recover_from_dlm_error(lockres, 1);
938 }
939
940 mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
941
942bail:
943 mlog_exit(ret);
944 return ret;
945}
946
947static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
948 int flag)
949{
950 unsigned long flags;
951 int ret;
952
953 spin_lock_irqsave(&lockres->l_lock, flags);
954 ret = lockres->l_flags & flag;
955 spin_unlock_irqrestore(&lockres->l_lock, flags);
956
957 return ret;
958}
959
960static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
961
962{
963 wait_event(lockres->l_event,
964 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
965}
966
967static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
968
969{
970 wait_event(lockres->l_event,
971 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
972}
973
974/* predict what lock level we'll be dropping down to on behalf
975 * of another node, and return true if the currently wanted
976 * level will be compatible with it. */
977static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
978 int wanted)
979{
980 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
981
982 return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
983}
984
985static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
986{
987 INIT_LIST_HEAD(&mw->mw_item);
988 init_completion(&mw->mw_complete);
989}
990
991static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
992{
993 wait_for_completion(&mw->mw_complete);
994 /* Re-arm the completion in case we want to wait on it again */
995 INIT_COMPLETION(mw->mw_complete);
996 return mw->mw_status;
997}
998
999static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
1000 struct ocfs2_mask_waiter *mw,
1001 unsigned long mask,
1002 unsigned long goal)
1003{
1004 BUG_ON(!list_empty(&mw->mw_item));
1005
1006 assert_spin_locked(&lockres->l_lock);
1007
1008 list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
1009 mw->mw_mask = mask;
1010 mw->mw_goal = goal;
1011}
1012
1013/* returns 0 if the mw that was removed was already satisfied, -EBUSY
1014 * if the mask still hadn't reached its goal */
1015static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
1016 struct ocfs2_mask_waiter *mw)
1017{
1018 unsigned long flags;
1019 int ret = 0;
1020
1021 spin_lock_irqsave(&lockres->l_lock, flags);
1022 if (!list_empty(&mw->mw_item)) {
1023 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
1024 ret = -EBUSY;
1025
1026 list_del_init(&mw->mw_item);
1027 init_completion(&mw->mw_complete);
1028 }
1029 spin_unlock_irqrestore(&lockres->l_lock, flags);
1030
1031 return ret;
1032
1033}
1034
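/*
 * Core lock acquisition: attach the DLM lock at NL if it doesn't exist
 * yet, then convert up to the requested level, using a mask waiter to
 * sleep while the lockres is BUSY or BLOCKED. With OCFS2_LOCK_NONBLOCK
 * in arg_flags we return -EAGAIN instead of sleeping so that callers
 * holding a page lock can back off and retry.
 */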
1035static int ocfs2_cluster_lock(struct ocfs2_super *osb,
1036 struct ocfs2_lock_res *lockres,
1037 int level,
1038 int lkm_flags,
1039 int arg_flags)
1040{
1041 struct ocfs2_mask_waiter mw;
1042 enum dlm_status status;
1043 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
1044 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
1045 unsigned long flags;
1046
1047 mlog_entry_void();
1048
1049 ocfs2_init_mask_waiter(&mw);
1050
1051again:
1052 wait = 0;
1053
1054 if (catch_signals && signal_pending(current)) {
1055 ret = -ERESTARTSYS;
1056 goto out;
1057 }
1058
1059 spin_lock_irqsave(&lockres->l_lock, flags);
1060
1061 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
1062 "Cluster lock called on freeing lockres %s! flags "
1063 "0x%lx\n", lockres->l_name, lockres->l_flags);
1064
1065 /* We only compare against the currently granted level
1066 * here. If the lock is blocked waiting on a downconvert,
1067 * we'll get caught below. */
1068 if (lockres->l_flags & OCFS2_LOCK_BUSY &&
1069 level > lockres->l_level) {
1070 /* is someone sitting in dlm_lock? If so, wait on
1071 * them. */
1072 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1073 wait = 1;
1074 goto unlock;
1075 }
1076
1077 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1078 /* lock has not been created yet. */
1079 spin_unlock_irqrestore(&lockres->l_lock, flags);
1080
1081 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
1082 if (ret < 0) {
1083 mlog_errno(ret);
1084 goto out;
1085 }
1086 goto again;
1087 }
1088
1089 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
1090 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
1091 /* is the lock currently blocked on behalf of
1092 * another node */
1093 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
1094 wait = 1;
1095 goto unlock;
1096 }
1097
1098 if (level > lockres->l_level) {
1099 if (lockres->l_action != OCFS2_AST_INVALID)
1100 mlog(ML_ERROR, "lockres %s has action %u pending\n",
1101 lockres->l_name, lockres->l_action);
1102
1103 lockres->l_action = OCFS2_AST_CONVERT;
1104 lockres->l_requested = level;
1105 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1106 spin_unlock_irqrestore(&lockres->l_lock, flags);
1107
1108 BUG_ON(level == LKM_IVMODE);
1109 BUG_ON(level == LKM_NLMODE);
1110
1111 mlog(0, "lock %s, convert from %d to level = %d\n",
1112 lockres->l_name, lockres->l_level, level);
1113
1114 /* call dlm_lock to upgrade lock now */
1115 status = dlmlock(osb->dlm,
1116 level,
1117 &lockres->l_lksb,
1118 lkm_flags|LKM_CONVERT|LKM_VALBLK,
1119 lockres->l_name,
1120 OCFS2_LOCK_ID_MAX_LEN - 1,
1121 lockres->l_ops->ast,
1122 lockres,
1123 lockres->l_ops->bast);
1124 if (status != DLM_NORMAL) {
1125 if ((lkm_flags & LKM_NOQUEUE) &&
1126 (status == DLM_NOTQUEUED))
1127 ret = -EAGAIN;
1128 else {
1129 ocfs2_log_dlm_error("dlmlock", status,
1130 lockres);
1131 ret = -EINVAL;
1132 }
1133 ocfs2_recover_from_dlm_error(lockres, 1);
1134 goto out;
1135 }
1136
1137 mlog(0, "lock %s, successfull return from dlmlock\n",
1138 lockres->l_name);
1139
1140 /* At this point we've gone inside the dlm and need to
1141 * complete our work regardless. */
1142 catch_signals = 0;
1143
1144 /* wait for busy to clear and carry on */
1145 goto again;
1146 }
1147
1148 /* Ok, if we get here then we're good to go. */
1149 ocfs2_inc_holders(lockres, level);
1150
1151 ret = 0;
1152unlock:
1153 spin_unlock_irqrestore(&lockres->l_lock, flags);
1154out:
1155 /*
1156 * This is helping work around a lock inversion between the page lock
1157 * and dlm locks. One path holds the page lock while calling aops
1158 * which block acquiring dlm locks. The voting thread holds dlm
1159 * locks while acquiring page locks while down converting data locks.
1160 * This block is helping an aop path notice the inversion and back
1161 * off to unlock its page lock before trying the dlm lock again.
1162 */
1163 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
1164 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
1165 wait = 0;
1166 if (lockres_remove_mask_waiter(lockres, &mw))
1167 ret = -EAGAIN;
1168 else
1169 goto again;
1170 }
1171 if (wait) {
1172 ret = ocfs2_wait_for_mask(&mw);
1173 if (ret == 0)
1174 goto again;
1175 mlog_errno(ret);
1176 }
1177
1178 mlog_exit(ret);
1179 return ret;
1180}
1181
1182static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1183 struct ocfs2_lock_res *lockres,
1184 int level)
1185{
1186 unsigned long flags;
1187
1188 mlog_entry_void();
1189 spin_lock_irqsave(&lockres->l_lock, flags);
1190 ocfs2_dec_holders(lockres, level);
1191 ocfs2_vote_on_unlock(osb, lockres);
1192 spin_unlock_irqrestore(&lockres->l_lock, flags);
1193 mlog_exit_void();
1194}
1195
1196int ocfs2_create_new_lock(struct ocfs2_super *osb,
1197 struct ocfs2_lock_res *lockres,
1198 int ex,
1199 int local)
1200{
1201 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1202 unsigned long flags;
1203 int lkm_flags = local ? LKM_LOCAL : 0;
1204
1205 spin_lock_irqsave(&lockres->l_lock, flags);
1206 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1207 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1208 spin_unlock_irqrestore(&lockres->l_lock, flags);
1209
1210 return ocfs2_lock_create(osb, lockres, level, lkm_flags);
1211}
1212
1213/* Grants us an EX lock on the data and metadata resources, skipping
1214 * the normal cluster directory lookup. Use this ONLY on newly created
1215 * inodes which other nodes can't possibly see, and which haven't been
1216 * hashed in the inode hash yet. This can give us a good performance
1217 * increase as it'll skip the network broadcast normally associated
1218 * with creating a new lock resource. */
1219int ocfs2_create_new_inode_locks(struct inode *inode)
1220{
1221 int ret;
1222 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1223
1224 BUG_ON(!inode);
1225 BUG_ON(!ocfs2_inode_is_new(inode));
1226
1227 mlog_entry_void();
1228
Mark Fashehb0697052006-03-03 10:24:33 -08001229 mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001230
1231 /* NOTE: That we don't increment any of the holder counts, nor
1232 * do we add anything to a journal handle. Since this is
1233 * supposed to be a new inode which the cluster doesn't know
1234 * about yet, there is no need to. As far as the LVB handling
1235 * is concerned, this is basically like acquiring an EX lock
1236 * on a resource which has an invalid one -- we'll set it
1237 * valid when we release the EX. */
1238
1239 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
1240 if (ret) {
1241 mlog_errno(ret);
1242 goto bail;
1243 }
1244
1245 /*
1246 * We don't want to use LKM_LOCAL on a meta data lock as they
1247 * don't use a generation in their lock names.
1248 */
1249 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
1250 if (ret) {
1251 mlog_errno(ret);
1252 goto bail;
1253 }
1254
1255 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
1256 if (ret) {
1257 mlog_errno(ret);
1258 goto bail;
1259 }
1260
1261bail:
1262 mlog_exit(ret);
1263 return ret;
1264}
1265
1266int ocfs2_rw_lock(struct inode *inode, int write)
1267{
1268 int status, level;
1269 struct ocfs2_lock_res *lockres;
1270
1271 BUG_ON(!inode);
1272
1273 mlog_entry_void();
1274
Mark Fashehb0697052006-03-03 10:24:33 -08001275 mlog(0, "inode %llu take %s RW lock\n",
1276 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001277 write ? "EXMODE" : "PRMODE");
1278
1279 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1280
1281 level = write ? LKM_EXMODE : LKM_PRMODE;
1282
1283 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1284 0);
1285 if (status < 0)
1286 mlog_errno(status);
1287
1288 mlog_exit(status);
1289 return status;
1290}
1291
1292void ocfs2_rw_unlock(struct inode *inode, int write)
1293{
1294 int level = write ? LKM_EXMODE : LKM_PRMODE;
1295 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1296
1297 mlog_entry_void();
1298
Mark Fashehb0697052006-03-03 10:24:33 -08001299 mlog(0, "inode %llu drop %s RW lock\n",
1300 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001301 write ? "EXMODE" : "PRMODE");
1302
1303 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1304
1305 mlog_exit_void();
1306}
1307
1308int ocfs2_data_lock_full(struct inode *inode,
1309 int write,
1310 int arg_flags)
1311{
1312 int status = 0, level;
1313 struct ocfs2_lock_res *lockres;
1314
1315 BUG_ON(!inode);
1316
1317 mlog_entry_void();
1318
Mark Fashehb0697052006-03-03 10:24:33 -08001319 mlog(0, "inode %llu take %s DATA lock\n",
1320 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001321 write ? "EXMODE" : "PRMODE");
1322
1323 /* We'll allow faking a readonly data lock for
1324 * rodevices. */
1325 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1326 if (write) {
1327 status = -EROFS;
1328 mlog_errno(status);
1329 }
1330 goto out;
1331 }
1332
1333 lockres = &OCFS2_I(inode)->ip_data_lockres;
1334
1335 level = write ? LKM_EXMODE : LKM_PRMODE;
1336
1337 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1338 0, arg_flags);
1339 if (status < 0 && status != -EAGAIN)
1340 mlog_errno(status);
1341
1342out:
1343 mlog_exit(status);
1344 return status;
1345}
1346
1347/* see ocfs2_meta_lock_with_page() */
1348int ocfs2_data_lock_with_page(struct inode *inode,
1349 int write,
1350 struct page *page)
1351{
1352 int ret;
1353
1354 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1355 if (ret == -EAGAIN) {
1356 unlock_page(page);
1357 if (ocfs2_data_lock(inode, write) == 0)
1358 ocfs2_data_unlock(inode, write);
1359 ret = AOP_TRUNCATED_PAGE;
1360 }
1361
1362 return ret;
1363}
1364
1365static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1366 struct ocfs2_lock_res *lockres)
1367{
1368 int kick = 0;
1369
1370 mlog_entry_void();
1371
1372 /* If we know that another node is waiting on our lock, kick
1373 * the vote thread pre-emptively when we reach a release
1374 * condition. */
1375 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1376 switch(lockres->l_blocking) {
1377 case LKM_EXMODE:
1378 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1379 kick = 1;
1380 break;
1381 case LKM_PRMODE:
1382 if (!lockres->l_ex_holders)
1383 kick = 1;
1384 break;
1385 default:
1386 BUG();
1387 }
1388 }
1389
1390 if (kick)
1391 ocfs2_kick_vote_thread(osb);
1392
1393 mlog_exit_void();
1394}
1395
1396void ocfs2_data_unlock(struct inode *inode,
1397 int write)
1398{
1399 int level = write ? LKM_EXMODE : LKM_PRMODE;
1400 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1401
1402 mlog_entry_void();
1403
Mark Fashehb0697052006-03-03 10:24:33 -08001404 mlog(0, "inode %llu drop %s DATA lock\n",
1405 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001406 write ? "EXMODE" : "PRMODE");
1407
1408 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1409 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1410
1411 mlog_exit_void();
1412}
1413
1414#define OCFS2_SEC_BITS 34
1415#define OCFS2_SEC_SHIFT (64 - 34)
1416#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
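/* Packed layout: seconds in the high 34 bits, nanoseconds in the low 30. */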
1417
1418/* LVB only has room for 64 bits of time here so we pack it for
1419 * now. */
1420static u64 ocfs2_pack_timespec(struct timespec *spec)
1421{
1422 u64 res;
1423 u64 sec = spec->tv_sec;
1424 u32 nsec = spec->tv_nsec;
1425
1426 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1427
1428 return res;
1429}
1430
1431/* Call this with the lockres locked. I am reasonably sure we don't
1432 * need ip_lock in this function as anyone who would be changing those
1433 * values is supposed to be blocked in ocfs2_meta_lock right now. */
1434static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1435{
1436 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1437 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1438 struct ocfs2_meta_lvb *lvb;
1439
1440 mlog_entry_void();
1441
1442 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1443
1444 /*
1445 * Invalidate the LVB of a deleted inode - this way other
1446 * nodes are forced to go to disk and discover the new inode
1447 * status.
1448 */
1449 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1450 lvb->lvb_version = 0;
1451 goto out;
1452 }
1453
1454 lvb->lvb_version = OCFS2_LVB_VERSION;
1455 lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
1456 lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
1457 lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
1458 lvb->lvb_igid = cpu_to_be32(inode->i_gid);
1459 lvb->lvb_imode = cpu_to_be16(inode->i_mode);
1460 lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
1461 lvb->lvb_iatime_packed =
1462 cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
1463 lvb->lvb_ictime_packed =
1464 cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
1465 lvb->lvb_imtime_packed =
1466 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1467 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
1468 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
1469
1470out:
1471 mlog_meta_lvb(0, lockres);
1472
1473 mlog_exit_void();
1474}
1475
1476static void ocfs2_unpack_timespec(struct timespec *spec,
1477 u64 packed_time)
1478{
1479 spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
1480 spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
1481}
1482
1483static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1484{
1485 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1486 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1487 struct ocfs2_meta_lvb *lvb;
1488
1489 mlog_entry_void();
1490
1491 mlog_meta_lvb(0, lockres);
1492
1493 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1494
1495 /* We're safe here without the lockres lock... */
1496 spin_lock(&oi->ip_lock);
1497 oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
1498 i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1499
1500 oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
1501 ocfs2_set_inode_flags(inode);
1502
1503 /* fast-symlinks are a special case */
1504 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1505 inode->i_blocks = 0;
1506 else
1507 inode->i_blocks =
1508 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1509
1510 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1511 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
1512 inode->i_mode = be16_to_cpu(lvb->lvb_imode);
1513 inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
1514 ocfs2_unpack_timespec(&inode->i_atime,
1515 be64_to_cpu(lvb->lvb_iatime_packed));
1516 ocfs2_unpack_timespec(&inode->i_mtime,
1517 be64_to_cpu(lvb->lvb_imtime_packed));
1518 ocfs2_unpack_timespec(&inode->i_ctime,
1519 be64_to_cpu(lvb->lvb_ictime_packed));
1520 spin_unlock(&oi->ip_lock);
1521
1522 mlog_exit_void();
1523}
1524
1525static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1526 struct ocfs2_lock_res *lockres)
1527{
1528 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1529
1530 if (lvb->lvb_version == OCFS2_LVB_VERSION
1531 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
1532 return 1;
1533 return 0;
1534}
1535
1536/* Determine whether a lock resource needs to be refreshed, and
1537 * arbitrate who gets to refresh it.
1538 *
1539 * 0 means no refresh needed.
1540 *
1541 * > 0 means you need to refresh this and you MUST call
1542 * ocfs2_complete_lock_res_refresh afterwards. */
1543static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
1544{
1545 unsigned long flags;
1546 int status = 0;
1547
1548 mlog_entry_void();
1549
1550refresh_check:
1551 spin_lock_irqsave(&lockres->l_lock, flags);
1552 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
1553 spin_unlock_irqrestore(&lockres->l_lock, flags);
1554 goto bail;
1555 }
1556
1557 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
1558 spin_unlock_irqrestore(&lockres->l_lock, flags);
1559
1560 ocfs2_wait_on_refreshing_lock(lockres);
1561 goto refresh_check;
1562 }
1563
1564 /* Ok, I'll be the one to refresh this lock. */
1565 lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
1566 spin_unlock_irqrestore(&lockres->l_lock, flags);
1567
1568 status = 1;
1569bail:
1570 mlog_exit(status);
1571 return status;
1572}
1573
1574/* If status is non-zero, I'll mark it as not being in refresh
1575 * anymore, but I won't clear the needs refresh flag. */
1576static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
1577 int status)
1578{
1579 unsigned long flags;
1580 mlog_entry_void();
1581
1582 spin_lock_irqsave(&lockres->l_lock, flags);
1583 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
1584 if (!status)
1585 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
1586 spin_unlock_irqrestore(&lockres->l_lock, flags);
1587
1588 wake_up(&lockres->l_event);
1589
1590 mlog_exit_void();
1591}
1592
1593/* may or may not return a bh if it went to disk. */
1594static int ocfs2_meta_lock_update(struct inode *inode,
1595 struct buffer_head **bh)
1596{
1597 int status = 0;
1598 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1599 struct ocfs2_lock_res *lockres;
1600 struct ocfs2_dinode *fe;
1601
1602 mlog_entry_void();
1603
1604 spin_lock(&oi->ip_lock);
1605 if (oi->ip_flags & OCFS2_INODE_DELETED) {
Mark Fashehb0697052006-03-03 10:24:33 -08001606 mlog(0, "Orphaned inode %llu was deleted while we "
Mark Fashehccd979b2005-12-15 14:31:24 -08001607 "were waiting on a lock. ip_flags = 0x%x\n",
Mark Fashehb0697052006-03-03 10:24:33 -08001608 (unsigned long long)oi->ip_blkno, oi->ip_flags);
Mark Fashehccd979b2005-12-15 14:31:24 -08001609 spin_unlock(&oi->ip_lock);
1610 status = -ENOENT;
1611 goto bail;
1612 }
1613 spin_unlock(&oi->ip_lock);
1614
1615 lockres = &oi->ip_meta_lockres;
1616
1617 if (!ocfs2_should_refresh_lock_res(lockres))
1618 goto bail;
1619
1620 /* This will discard any caching information we might have had
1621 * for the inode metadata. */
1622 ocfs2_metadata_cache_purge(inode);
1623
1624 /* will do nothing for inode types that don't use the extent
1625 * map (directories, bitmap files, etc) */
1626 ocfs2_extent_map_trunc(inode, 0);
1627
1628 if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
1629 mlog(0, "Trusting LVB on inode %llu\n",
1630 (unsigned long long)oi->ip_blkno);
1631 ocfs2_refresh_inode_from_lvb(inode);
1632 } else {
1633 /* Boo, we have to go to disk. */
1634 /* read bh, cast, ocfs2_refresh_inode */
1635 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
1636 bh, OCFS2_BH_CACHED, inode);
1637 if (status < 0) {
1638 mlog_errno(status);
1639 goto bail_refresh;
1640 }
1641 fe = (struct ocfs2_dinode *) (*bh)->b_data;
1642
1643 /* This is a good chance to make sure we're not
1644 * locking an invalid object.
1645 *
1646 * We bug on a stale inode here because we checked
1647 * above whether it was wiped from disk. The wiping
1648 * node provides a guarantee that we receive that
1649 * message and can mark the inode before dropping any
1650 * locks associated with it. */
1651 if (!OCFS2_IS_VALID_DINODE(fe)) {
1652 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1653 status = -EIO;
1654 goto bail_refresh;
1655 }
1656 mlog_bug_on_msg(inode->i_generation !=
1657 le32_to_cpu(fe->i_generation),
Mark Fashehb0697052006-03-03 10:24:33 -08001658 "Invalid dinode %llu disk generation: %u "
Mark Fashehccd979b2005-12-15 14:31:24 -08001659 "inode->i_generation: %u\n",
Mark Fashehb0697052006-03-03 10:24:33 -08001660 (unsigned long long)oi->ip_blkno,
1661 le32_to_cpu(fe->i_generation),
Mark Fashehccd979b2005-12-15 14:31:24 -08001662 inode->i_generation);
1663 mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
1664 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
Mark Fashehb0697052006-03-03 10:24:33 -08001665 "Stale dinode %llu dtime: %llu flags: 0x%x\n",
1666 (unsigned long long)oi->ip_blkno,
1667 (unsigned long long)le64_to_cpu(fe->i_dtime),
1668 le32_to_cpu(fe->i_flags));
1669
1670 ocfs2_refresh_inode(inode, fe);
1671 }
1672
1673 status = 0;
1674bail_refresh:
1675 ocfs2_complete_lock_res_refresh(lockres, status);
1676bail:
1677 mlog_exit(status);
1678 return status;
1679}
1680
1681static int ocfs2_assign_bh(struct inode *inode,
1682 struct buffer_head **ret_bh,
1683 struct buffer_head *passed_bh)
1684{
1685 int status;
1686
1687 if (passed_bh) {
1688 /* Ok, the update went to disk for us, use the
1689 * returned bh. */
1690 *ret_bh = passed_bh;
1691 get_bh(*ret_bh);
1692
1693 return 0;
1694 }
1695
1696 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1697 OCFS2_I(inode)->ip_blkno,
1698 ret_bh,
1699 OCFS2_BH_CACHED,
1700 inode);
1701 if (status < 0)
1702 mlog_errno(status);
1703
1704 return status;
1705}
1706
1707/*
1708 * returns < 0 error if the callback will never be called, otherwise
1709 * the result of the lock will be communicated via the callback.
1710 */
1711int ocfs2_meta_lock_full(struct inode *inode,
1712 struct ocfs2_journal_handle *handle,
1713 struct buffer_head **ret_bh,
1714 int ex,
1715 int arg_flags)
1716{
1717 int status, level, dlm_flags, acquired;
1718 struct ocfs2_lock_res *lockres;
1719 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1720 struct buffer_head *local_bh = NULL;
1721
1722 BUG_ON(!inode);
1723
1724 mlog_entry_void();
1725
Mark Fashehb0697052006-03-03 10:24:33 -08001726 mlog(0, "inode %llu, take %s META lock\n",
1727 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001728 ex ? "EXMODE" : "PRMODE");
1729
1730 status = 0;
1731 acquired = 0;
1732 /* We'll allow faking a readonly metadata lock for
1733	 * read-only devices. */
1734 if (ocfs2_is_hard_readonly(osb)) {
1735 if (ex)
1736 status = -EROFS;
1737 goto bail;
1738 }
1739
1740 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1741 wait_event(osb->recovery_event,
1742 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1743
1744 acquired = 0;
1745 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1746 level = ex ? LKM_EXMODE : LKM_PRMODE;
1747 dlm_flags = 0;
1748 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1749 dlm_flags |= LKM_NOQUEUE;
1750
1751 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1752 if (status < 0) {
1753 if (status != -EAGAIN && status != -EIOCBRETRY)
1754 mlog_errno(status);
1755 goto bail;
1756 }
1757
1758 /* Notify the error cleanup path to drop the cluster lock. */
1759 acquired = 1;
1760
1761 /* We wait twice because a node may have died while we were in
1762 * the lower dlm layers. The second time though, we've
1763 * committed to owning this lock so we don't allow signals to
1764 * abort the operation. */
1765 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1766 wait_event(osb->recovery_event,
1767 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1768
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001769 /*
1770 * We only see this flag if we're being called from
1771 * ocfs2_read_locked_inode(). It means we're locking an inode
1772 * which hasn't been populated yet, so clear the refresh flag
1773 * and let the caller handle it.
1774 */
1775 if (inode->i_state & I_NEW) {
1776 status = 0;
1777 ocfs2_complete_lock_res_refresh(lockres, 0);
1778 goto bail;
1779 }
1780
Mark Fashehccd979b2005-12-15 14:31:24 -08001781 /* This is fun. The caller may want a bh back, or it may
1782 * not. ocfs2_meta_lock_update definitely wants one in, but
1783 * may or may not read one, depending on what's in the
1784 * LVB. The result of all of this is that we've *only* gone to
1785 * disk if we have to, so the complexity is worthwhile. */
1786 status = ocfs2_meta_lock_update(inode, &local_bh);
1787 if (status < 0) {
1788 if (status != -ENOENT)
1789 mlog_errno(status);
1790 goto bail;
1791 }
1792
1793 if (ret_bh) {
1794 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1795 if (status < 0) {
1796 mlog_errno(status);
1797 goto bail;
1798 }
1799 }
1800
1801 if (handle) {
1802 status = ocfs2_handle_add_lock(handle, inode);
1803 if (status < 0)
1804 mlog_errno(status);
1805 }
1806
1807bail:
1808 if (status < 0) {
1809 if (ret_bh && (*ret_bh)) {
1810 brelse(*ret_bh);
1811 *ret_bh = NULL;
1812 }
1813 if (acquired)
1814 ocfs2_meta_unlock(inode, ex);
1815 }
1816
1817 if (local_bh)
1818 brelse(local_bh);
1819
1820 mlog_exit(status);
1821 return status;
1822}
1823
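/*
 * Illustrative sketch only (not code from this file): the usual calling
 * pattern around the metadata cluster lock, using the ocfs2_meta_lock()
 * wrapper seen elsewhere in this file. Error handling is trimmed and the
 * surrounding context (inode, handle) is assumed.
 *
 *	struct buffer_head *di_bh = NULL;
 *
 *	status = ocfs2_meta_lock(inode, handle, &di_bh, 1);
 *	if (status < 0) {
 *		mlog_errno(status);
 *		goto out;
 *	}
 *
 *	... read or update the struct ocfs2_dinode in di_bh ...
 *
 *	ocfs2_meta_unlock(inode, 1);
 *	brelse(di_bh);
 */
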
1824/*
1825 * This is working around a lock inversion between tasks acquiring DLM locks
1826 * while holding a page lock and the vote thread which blocks dlm lock acquiry
1827 * while acquiring page locks.
1828 *
1829 * ** These _with_page variants are only intended to be called from aop
1830 * methods that hold page locks and return a very specific *positive* error
1831 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1832 *
1833 * The DLM is called such that it returns -EAGAIN if it would have blocked
1834 * waiting for the vote thread. In that case we unlock our page so the vote
1835 * thread can make progress. Once we've done this we have to return
1836 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
1837 * into the VFS, which will then immediately retry the aop call.
1838 *
1839 * We do a blocking lock and immediate unlock before returning, though, so that
1840 * the lock has a great chance of being cached on this node by the time the VFS
1841 * calls back to retry the aop. This has the potential to livelock as nodes
1842 * ping locks back and forth, but that's a risk we're willing to take in order
1843 * to keep the fix for the lock inversion simple.
1844 */
1845int ocfs2_meta_lock_with_page(struct inode *inode,
1846 struct ocfs2_journal_handle *handle,
1847 struct buffer_head **ret_bh,
1848 int ex,
1849 struct page *page)
1850{
1851 int ret;
1852
1853 ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
1854 OCFS2_LOCK_NONBLOCK);
1855 if (ret == -EAGAIN) {
1856 unlock_page(page);
1857 if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
1858 ocfs2_meta_unlock(inode, ex);
1859 ret = AOP_TRUNCATED_PAGE;
1860 }
1861
1862 return ret;
1863}
1864
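/*
 * Sketch of an assumed caller (not code from this file) showing how an
 * aop method is expected to use the _with_page variant: the positive
 * AOP_TRUNCATED_PAGE value must be handed straight back to the VFS,
 * which will retry the aop once the lock has had a chance to be cached.
 *
 *	static int some_readpage(struct file *file, struct page *page)
 *	{
 *		struct inode *inode = page->mapping->host;
 *		int ret;
 *
 *		ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
 *		if (ret != 0)
 *			return ret;
 *
 *		... fill the page while holding the lock ...
 *
 *		ocfs2_meta_unlock(inode, 0);
 *		return 0;
 *	}
 */
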
1865void ocfs2_meta_unlock(struct inode *inode,
1866 int ex)
1867{
1868 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1869 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
1870
1871 mlog_entry_void();
1872
Mark Fashehb0697052006-03-03 10:24:33 -08001873 mlog(0, "inode %llu drop %s META lock\n",
1874 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001875 ex ? "EXMODE" : "PRMODE");
1876
1877 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1878 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1879
1880 mlog_exit_void();
1881}
1882
1883int ocfs2_super_lock(struct ocfs2_super *osb,
1884 int ex)
1885{
1886 int status;
1887 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1888 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1889 struct buffer_head *bh;
1890 struct ocfs2_slot_info *si = osb->slot_info;
1891
1892 mlog_entry_void();
1893
1894 if (ocfs2_is_hard_readonly(osb))
1895 return -EROFS;
1896
1897 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
1898 if (status < 0) {
1899 mlog_errno(status);
1900 goto bail;
1901 }
1902
1903 /* The super block lock path is really in the best position to
1904 * know when resources covered by the lock need to be
1905 * refreshed, so we do it here. Of course, making sense of
1906 * everything is up to the caller :) */
1907 status = ocfs2_should_refresh_lock_res(lockres);
1908 if (status < 0) {
1909 mlog_errno(status);
1910 goto bail;
1911 }
1912 if (status) {
1913 bh = si->si_bh;
1914 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
1915 si->si_inode);
1916 if (status == 0)
1917 ocfs2_update_slot_info(si);
1918
1919 ocfs2_complete_lock_res_refresh(lockres, status);
1920
1921 if (status < 0)
1922 mlog_errno(status);
1923 }
1924bail:
1925 mlog_exit(status);
1926 return status;
1927}
1928
1929void ocfs2_super_unlock(struct ocfs2_super *osb,
1930 int ex)
1931{
1932 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1933 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1934
1935 ocfs2_cluster_unlock(osb, lockres, level);
1936}
1937
1938int ocfs2_rename_lock(struct ocfs2_super *osb)
1939{
1940 int status;
1941 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1942
1943 if (ocfs2_is_hard_readonly(osb))
1944 return -EROFS;
1945
1946 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
1947 if (status < 0)
1948 mlog_errno(status);
1949
1950 return status;
1951}
1952
1953void ocfs2_rename_unlock(struct ocfs2_super *osb)
1954{
1955 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1956
1957 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1958}
1959
Mark Fashehd680efe2006-09-08 14:14:34 -07001960int ocfs2_dentry_lock(struct dentry *dentry, int ex)
1961{
1962 int ret;
1963 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1964 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
1965 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
1966
1967 BUG_ON(!dl);
1968
1969 if (ocfs2_is_hard_readonly(osb))
1970 return -EROFS;
1971
1972 ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
1973 if (ret < 0)
1974 mlog_errno(ret);
1975
1976 return ret;
1977}
1978
1979void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
1980{
1981 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1982 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
1983 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
1984
1985 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
1986}
1987
Mark Fashehccd979b2005-12-15 14:31:24 -08001988/* Reference counting of the dlm debug structure. We want this because
1989 * open references on the debug inodes can outlive the mount, so
1990 * we can't rely on the ocfs2_super to always exist. */
1991static void ocfs2_dlm_debug_free(struct kref *kref)
1992{
1993 struct ocfs2_dlm_debug *dlm_debug;
1994
1995 dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
1996
1997 kfree(dlm_debug);
1998}
1999
2000void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
2001{
2002 if (dlm_debug)
2003 kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
2004}
2005
2006static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
2007{
2008 kref_get(&debug->d_refcnt);
2009}
2010
2011struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
2012{
2013 struct ocfs2_dlm_debug *dlm_debug;
2014
2015 dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
2016 if (!dlm_debug) {
2017 mlog_errno(-ENOMEM);
2018 goto out;
2019 }
2020
2021 kref_init(&dlm_debug->d_refcnt);
2022 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
2023 dlm_debug->d_locking_state = NULL;
2024out:
2025 return dlm_debug;
2026}
2027
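/*
 * Lifecycle sketch (illustrative; assumes the usual mount/unmount
 * callers outside this file): the mount path allocates the structure,
 * the debugfs open path below takes an extra reference, and every user
 * drops its own reference so the kfree() only happens once the last
 * opener is gone.
 *
 *	osb->osb_dlm_debug = ocfs2_new_dlm_debug();
 *	...
 *	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
 *	...
 *	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
 */
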
2028/* Access to this is arbitrated for us via seq_file->sem. */
2029struct ocfs2_dlm_seq_priv {
2030 struct ocfs2_dlm_debug *p_dlm_debug;
2031 struct ocfs2_lock_res p_iter_res;
2032 struct ocfs2_lock_res p_tmp_res;
2033};
2034
2035static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
2036 struct ocfs2_dlm_seq_priv *priv)
2037{
2038 struct ocfs2_lock_res *iter, *ret = NULL;
2039 struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
2040
2041 assert_spin_locked(&ocfs2_dlm_tracking_lock);
2042
2043 list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
2044 /* discover the head of the list */
2045 if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
2046 mlog(0, "End of list found, %p\n", ret);
2047 break;
2048 }
2049
2050 /* We track our "dummy" iteration lockres' by a NULL
2051 * l_ops field. */
2052 if (iter->l_ops != NULL) {
2053 ret = iter;
2054 break;
2055 }
2056 }
2057
2058 return ret;
2059}
2060
2061static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
2062{
2063 struct ocfs2_dlm_seq_priv *priv = m->private;
2064 struct ocfs2_lock_res *iter;
2065
2066 spin_lock(&ocfs2_dlm_tracking_lock);
2067 iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
2068 if (iter) {
2069 /* Since lockres' have the lifetime of their container
2070 * (which can be inodes, ocfs2_supers, etc) we want to
2071 * copy this out to a temporary lockres while still
2072 * under the spinlock. Obviously after this we can't
2073 * trust any pointers on the copy returned, but that's
2074 * ok as the information we want isn't typically held
2075 * in them. */
2076 priv->p_tmp_res = *iter;
2077 iter = &priv->p_tmp_res;
2078 }
2079 spin_unlock(&ocfs2_dlm_tracking_lock);
2080
2081 return iter;
2082}
2083
2084static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
2085{
2086}
2087
2088static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
2089{
2090 struct ocfs2_dlm_seq_priv *priv = m->private;
2091 struct ocfs2_lock_res *iter = v;
2092 struct ocfs2_lock_res *dummy = &priv->p_iter_res;
2093
2094 spin_lock(&ocfs2_dlm_tracking_lock);
2095 iter = ocfs2_dlm_next_res(iter, priv);
2096 list_del_init(&dummy->l_debug_list);
2097 if (iter) {
2098 list_add(&dummy->l_debug_list, &iter->l_debug_list);
2099 priv->p_tmp_res = *iter;
2100 iter = &priv->p_tmp_res;
2101 }
2102 spin_unlock(&ocfs2_dlm_tracking_lock);
2103
2104 return iter;
2105}
2106
2107/* So that debugfs.ocfs2 can determine which format is being used */
2108#define OCFS2_DLM_DEBUG_STR_VERSION 1
2109static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2110{
2111 int i;
2112 char *lvb;
2113 struct ocfs2_lock_res *lockres = v;
2114
2115 if (!lockres)
2116 return -EINVAL;
2117
Mark Fashehd680efe2006-09-08 14:14:34 -07002118 seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);
2119
2120 if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
2121 seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1,
2122 lockres->l_name,
2123 (unsigned int)ocfs2_get_dentry_lock_ino(lockres));
2124 else
2125 seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name);
2126
2127 seq_printf(m, "%d\t"
Mark Fashehccd979b2005-12-15 14:31:24 -08002128 "0x%lx\t"
2129 "0x%x\t"
2130 "0x%x\t"
2131 "%u\t"
2132 "%u\t"
2133 "%d\t"
2134 "%d\t",
Mark Fashehccd979b2005-12-15 14:31:24 -08002135 lockres->l_level,
2136 lockres->l_flags,
2137 lockres->l_action,
2138 lockres->l_unlock_action,
2139 lockres->l_ro_holders,
2140 lockres->l_ex_holders,
2141 lockres->l_requested,
2142 lockres->l_blocking);
2143
2144 /* Dump the raw LVB */
2145 lvb = lockres->l_lksb.lvb;
2146 for(i = 0; i < DLM_LVB_LEN; i++)
2147 seq_printf(m, "0x%x\t", lvb[i]);
2148
2149 /* End the line */
2150 seq_printf(m, "\n");
2151 return 0;
2152}
2153
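/*
 * For reference, each record emitted above is one tab-separated line.
 * The field order below is taken directly from the seq_printf() calls;
 * the placeholders are schematic, not sample data:
 *
 *	<version> <name[+dentry ino]> <level> <flags> <action>
 *	<unlock_action> <ro_holders> <ex_holders> <requested>
 *	<blocking> <lvb[0]> ... <lvb[DLM_LVB_LEN - 1]>
 *
 * debugfs.ocfs2 keys off the leading OCFS2_DLM_DEBUG_STR_VERSION field
 * to decide how to parse the rest.
 */
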
2154static struct seq_operations ocfs2_dlm_seq_ops = {
2155 .start = ocfs2_dlm_seq_start,
2156 .stop = ocfs2_dlm_seq_stop,
2157 .next = ocfs2_dlm_seq_next,
2158 .show = ocfs2_dlm_seq_show,
2159};
2160
2161static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
2162{
2163 struct seq_file *seq = (struct seq_file *) file->private_data;
2164 struct ocfs2_dlm_seq_priv *priv = seq->private;
2165 struct ocfs2_lock_res *res = &priv->p_iter_res;
2166
2167 ocfs2_remove_lockres_tracking(res);
2168 ocfs2_put_dlm_debug(priv->p_dlm_debug);
2169 return seq_release_private(inode, file);
2170}
2171
2172static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
2173{
2174 int ret;
2175 struct ocfs2_dlm_seq_priv *priv;
2176 struct seq_file *seq;
2177 struct ocfs2_super *osb;
2178
2179 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
2180 if (!priv) {
2181 ret = -ENOMEM;
2182 mlog_errno(ret);
2183 goto out;
2184 }
2185 osb = (struct ocfs2_super *) inode->u.generic_ip;
2186 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2187 priv->p_dlm_debug = osb->osb_dlm_debug;
2188 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2189
2190 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2191 if (ret) {
2192 kfree(priv);
2193 mlog_errno(ret);
2194 goto out;
2195 }
2196
2197 seq = (struct seq_file *) file->private_data;
2198 seq->private = priv;
2199
2200 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2201 priv->p_dlm_debug);
2202
2203out:
2204 return ret;
2205}
2206
Arjan van de Ven4b6f5d22006-03-28 01:56:42 -08002207static const struct file_operations ocfs2_dlm_debug_fops = {
Mark Fashehccd979b2005-12-15 14:31:24 -08002208 .open = ocfs2_dlm_debug_open,
2209 .release = ocfs2_dlm_debug_release,
2210 .read = seq_read,
2211 .llseek = seq_lseek,
2212};
2213
2214static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2215{
2216 int ret = 0;
2217 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2218
2219 dlm_debug->d_locking_state = debugfs_create_file("locking_state",
2220 S_IFREG|S_IRUSR,
2221 osb->osb_debug_root,
2222 osb,
2223 &ocfs2_dlm_debug_fops);
2224 if (!dlm_debug->d_locking_state) {
2225 ret = -EINVAL;
2226 mlog(ML_ERROR,
2227 "Unable to create locking state debugfs file.\n");
2228 goto out;
2229 }
2230
2231 ocfs2_get_dlm_debug(dlm_debug);
2232out:
2233 return ret;
2234}
2235
2236static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2237{
2238 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2239
2240 if (dlm_debug) {
2241 debugfs_remove(dlm_debug->d_locking_state);
2242 ocfs2_put_dlm_debug(dlm_debug);
2243 }
2244}
2245
2246int ocfs2_dlm_init(struct ocfs2_super *osb)
2247{
2248 int status;
2249 u32 dlm_key;
2250 struct dlm_ctxt *dlm;
2251
2252 mlog_entry_void();
2253
2254 status = ocfs2_dlm_init_debug(osb);
2255 if (status < 0) {
2256 mlog_errno(status);
2257 goto bail;
2258 }
2259
2260 /* launch vote thread */
Mark Fasheh78427042006-05-04 12:03:26 -07002261 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
Mark Fashehccd979b2005-12-15 14:31:24 -08002262 if (IS_ERR(osb->vote_task)) {
2263 status = PTR_ERR(osb->vote_task);
2264 osb->vote_task = NULL;
2265 mlog_errno(status);
2266 goto bail;
2267 }
2268
2269 /* used by the dlm code to make message headers unique, each
2270 * node in this domain must agree on this. */
2271 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2272
2273 /* for now, uuid == domain */
2274 dlm = dlm_register_domain(osb->uuid_str, dlm_key);
2275 if (IS_ERR(dlm)) {
2276 status = PTR_ERR(dlm);
2277 mlog_errno(status);
2278 goto bail;
2279 }
2280
2281 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2282 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2283
2284 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
2285
2286 osb->dlm = dlm;
2287
2288 status = 0;
2289bail:
2290 if (status < 0) {
2291 ocfs2_dlm_shutdown_debug(osb);
2292 if (osb->vote_task)
2293 kthread_stop(osb->vote_task);
2294 }
2295
2296 mlog_exit(status);
2297 return status;
2298}
2299
2300void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2301{
2302 mlog_entry_void();
2303
2304 dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2305
2306 ocfs2_drop_osb_locks(osb);
2307
2308 if (osb->vote_task) {
2309 kthread_stop(osb->vote_task);
2310 osb->vote_task = NULL;
2311 }
2312
2313 ocfs2_lock_res_free(&osb->osb_super_lockres);
2314 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2315
2316 dlm_unregister_domain(osb->dlm);
2317 osb->dlm = NULL;
2318
2319 ocfs2_dlm_shutdown_debug(osb);
2320
2321 mlog_exit_void();
2322}
2323
2324static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
2325{
2326 struct ocfs2_lock_res *lockres = opaque;
2327 unsigned long flags;
2328
2329 mlog_entry_void();
2330
2331 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
2332 lockres->l_unlock_action);
2333
2334 spin_lock_irqsave(&lockres->l_lock, flags);
2335 /* We tried to cancel a convert request, but it was already
2336 * granted. All we want to do here is clear our unlock
2337 * state. The wake_up call done at the bottom is redundant
2338 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2339 * hurt anything anyway */
2340 if (status == DLM_CANCELGRANT &&
2341 lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2342 mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2343
2344 /* We don't clear the busy flag in this case as it
2345 * should have been cleared by the ast which the dlm
2346 * has called. */
2347 goto complete_unlock;
2348 }
2349
2350 if (status != DLM_NORMAL) {
2351		mlog(ML_ERROR, "DLM returned status %d for lock %s, "
2352 "unlock_action %d\n", status, lockres->l_name,
2353 lockres->l_unlock_action);
2354 spin_unlock_irqrestore(&lockres->l_lock, flags);
2355 return;
2356 }
2357
2358 switch(lockres->l_unlock_action) {
2359 case OCFS2_UNLOCK_CANCEL_CONVERT:
2360 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2361 lockres->l_action = OCFS2_AST_INVALID;
2362 break;
2363 case OCFS2_UNLOCK_DROP_LOCK:
2364 lockres->l_level = LKM_IVMODE;
2365 break;
2366 default:
2367 BUG();
2368 }
2369
2370 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2371complete_unlock:
2372 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2373 spin_unlock_irqrestore(&lockres->l_lock, flags);
2374
2375 wake_up(&lockres->l_event);
2376
2377 mlog_exit_void();
2378}
2379
2380typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
2381
2382struct drop_lock_cb {
2383 ocfs2_pre_drop_cb_t *drop_func;
2384 void *drop_data;
2385};
2386
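/*
 * Usage sketch for the pre-drop callback (mirrors ocfs2_drop_inode_locks()
 * further down; the callback and data names here are purely illustrative):
 *
 *	static void my_pre_drop(struct ocfs2_lock_res *lockres, void *data)
 *	{
 *		... last-minute work, e.g. stuffing the LVB ...
 *	}
 *
 *	struct drop_lock_cb dcb = { my_pre_drop, my_data, };
 *	ocfs2_drop_lock(osb, lockres, &dcb);
 *
 * Passing a NULL dcb simply skips the pre-drop step.
 */
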
2387static int ocfs2_drop_lock(struct ocfs2_super *osb,
2388 struct ocfs2_lock_res *lockres,
2389 struct drop_lock_cb *dcb)
2390{
2391 enum dlm_status status;
2392 unsigned long flags;
2393
2394 /* We didn't get anywhere near actually using this lockres. */
2395 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2396 goto out;
2397
2398 spin_lock_irqsave(&lockres->l_lock, flags);
2399
2400 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
2401 "lockres %s, flags 0x%lx\n",
2402 lockres->l_name, lockres->l_flags);
2403
2404 while (lockres->l_flags & OCFS2_LOCK_BUSY) {
2405 mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
2406 "%u, unlock_action = %u\n",
2407 lockres->l_name, lockres->l_flags, lockres->l_action,
2408 lockres->l_unlock_action);
2409
2410 spin_unlock_irqrestore(&lockres->l_lock, flags);
2411
2412 /* XXX: Today we just wait on any busy
2413 * locks... Perhaps we need to cancel converts in the
2414 * future? */
2415 ocfs2_wait_on_busy_lock(lockres);
2416
2417 spin_lock_irqsave(&lockres->l_lock, flags);
2418 }
2419
2420 if (dcb)
2421 dcb->drop_func(lockres, dcb->drop_data);
2422
2423 if (lockres->l_flags & OCFS2_LOCK_BUSY)
2424 mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
2425 lockres->l_name);
2426 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2427 mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
2428
2429 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
2430 spin_unlock_irqrestore(&lockres->l_lock, flags);
2431 goto out;
2432 }
2433
2434 lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
2435
2436 /* make sure we never get here while waiting for an ast to
2437 * fire. */
2438 BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
2439
2440 /* is this necessary? */
2441 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2442 lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
2443 spin_unlock_irqrestore(&lockres->l_lock, flags);
2444
2445 mlog(0, "lock %s\n", lockres->l_name);
2446
2447 status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
2448 lockres->l_ops->unlock_ast, lockres);
2449 if (status != DLM_NORMAL) {
2450 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2451 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2452 dlm_print_one_lock(lockres->l_lksb.lockid);
2453 BUG();
2454 }
2455	mlog(0, "lock %s, successful return from dlmunlock\n",
2456 lockres->l_name);
2457
2458 ocfs2_wait_on_busy_lock(lockres);
2459out:
2460 mlog_exit(0);
2461 return 0;
2462}
2463
2464/* Mark the lockres as being dropped. It will no longer be
2465 * queued if blocking, but we still may have to wait on it
2466 * being dequeued from the vote thread before we can consider
2467 * it safe to drop.
2468 *
2469 * You can *not* attempt to call cluster_lock on this lockres anymore. */
2470void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
2471{
2472 int status;
2473 struct ocfs2_mask_waiter mw;
2474 unsigned long flags;
2475
2476 ocfs2_init_mask_waiter(&mw);
2477
2478 spin_lock_irqsave(&lockres->l_lock, flags);
2479 lockres->l_flags |= OCFS2_LOCK_FREEING;
2480 while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
2481 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
2482 spin_unlock_irqrestore(&lockres->l_lock, flags);
2483
2484 mlog(0, "Waiting on lockres %s\n", lockres->l_name);
2485
2486 status = ocfs2_wait_for_mask(&mw);
2487 if (status)
2488 mlog_errno(status);
2489
2490 spin_lock_irqsave(&lockres->l_lock, flags);
2491 }
2492 spin_unlock_irqrestore(&lockres->l_lock, flags);
2493}
2494
Mark Fashehd680efe2006-09-08 14:14:34 -07002495void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
2496 struct ocfs2_lock_res *lockres)
2497{
2498 int ret;
2499
2500 ocfs2_mark_lockres_freeing(lockres);
2501 ret = ocfs2_drop_lock(osb, lockres, NULL);
2502 if (ret)
2503 mlog_errno(ret);
2504}
2505
Mark Fashehccd979b2005-12-15 14:31:24 -08002506static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2507{
Mark Fashehd680efe2006-09-08 14:14:34 -07002508 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
2509 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
Mark Fashehccd979b2005-12-15 14:31:24 -08002510}
2511
2512static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
2513{
2514 struct inode *inode = data;
2515
2516 /* the metadata lock requires a bit more work as we have an
2517 * LVB to worry about. */
2518 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2519 lockres->l_level == LKM_EXMODE &&
2520 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2521 __ocfs2_stuff_meta_lvb(inode);
2522}
2523
2524int ocfs2_drop_inode_locks(struct inode *inode)
2525{
2526 int status, err;
2527 struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2528
2529 mlog_entry_void();
2530
2531 /* No need to call ocfs2_mark_lockres_freeing here -
2532 * ocfs2_clear_inode has done it for us. */
2533
2534 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2535 &OCFS2_I(inode)->ip_data_lockres,
2536 NULL);
2537 if (err < 0)
2538 mlog_errno(err);
2539
2540 status = err;
2541
2542 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2543 &OCFS2_I(inode)->ip_meta_lockres,
2544 &meta_dcb);
2545 if (err < 0)
2546 mlog_errno(err);
2547 if (err < 0 && !status)
2548 status = err;
2549
2550 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2551 &OCFS2_I(inode)->ip_rw_lockres,
2552 NULL);
2553 if (err < 0)
2554 mlog_errno(err);
2555 if (err < 0 && !status)
2556 status = err;
2557
2558 mlog_exit(status);
2559 return status;
2560}
2561
2562static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2563 int new_level)
2564{
2565 assert_spin_locked(&lockres->l_lock);
2566
2567 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
2568
2569 if (lockres->l_level <= new_level) {
2570 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
2571 lockres->l_level, new_level);
2572 BUG();
2573 }
2574
2575 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
2576 lockres->l_name, new_level, lockres->l_blocking);
2577
2578 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2579 lockres->l_requested = new_level;
2580 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2581}
2582
2583static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2584 struct ocfs2_lock_res *lockres,
2585 int new_level,
2586 int lvb)
2587{
2588 int ret, dlm_flags = LKM_CONVERT;
2589 enum dlm_status status;
2590
2591 mlog_entry_void();
2592
2593 if (lvb)
2594 dlm_flags |= LKM_VALBLK;
2595
2596 status = dlmlock(osb->dlm,
2597 new_level,
2598 &lockres->l_lksb,
2599 dlm_flags,
2600 lockres->l_name,
Mark Fashehf0681062006-09-08 11:40:10 -07002601 OCFS2_LOCK_ID_MAX_LEN - 1,
Mark Fashehccd979b2005-12-15 14:31:24 -08002602 lockres->l_ops->ast,
2603 lockres,
2604 lockres->l_ops->bast);
2605 if (status != DLM_NORMAL) {
2606 ocfs2_log_dlm_error("dlmlock", status, lockres);
2607 ret = -EINVAL;
2608 ocfs2_recover_from_dlm_error(lockres, 1);
2609 goto bail;
2610 }
2611
2612 ret = 0;
2613bail:
2614 mlog_exit(ret);
2615 return ret;
2616}
2617
2618/* returns 1 when the caller should unlock and call dlmunlock */
2619static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2620 struct ocfs2_lock_res *lockres)
2621{
2622 assert_spin_locked(&lockres->l_lock);
2623
2624 mlog_entry_void();
2625 mlog(0, "lock %s\n", lockres->l_name);
2626
2627 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2628 /* If we're already trying to cancel a lock conversion
2629 * then just drop the spinlock and allow the caller to
2630 * requeue this lock. */
2631
2632 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
2633 return 0;
2634 }
2635
2636 /* were we in a convert when we got the bast fire? */
2637 BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
2638 lockres->l_action != OCFS2_AST_DOWNCONVERT);
2639 /* set things up for the unlockast to know to just
2640 * clear out the ast_action and unset busy, etc. */
2641 lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
2642
2643 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
2644 "lock %s, invalid flags: 0x%lx\n",
2645 lockres->l_name, lockres->l_flags);
2646
2647 return 1;
2648}
2649
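/*
 * Calling convention sketch -- this is the pattern the unblock paths
 * below follow, spelled out here to make the locking explicit:
 *
 *	spin_lock_irqsave(&lockres->l_lock, flags);
 *	ret = ocfs2_prepare_cancel_convert(osb, lockres);
 *	spin_unlock_irqrestore(&lockres->l_lock, flags);
 *	if (ret) {
 *		ret = ocfs2_cancel_convert(osb, lockres);
 *		if (ret < 0)
 *			mlog_errno(ret);
 *	}
 */
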
2650static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2651 struct ocfs2_lock_res *lockres)
2652{
2653 int ret;
2654 enum dlm_status status;
2655
2656 mlog_entry_void();
2657 mlog(0, "lock %s\n", lockres->l_name);
2658
2659 ret = 0;
2660 status = dlmunlock(osb->dlm,
2661 &lockres->l_lksb,
2662 LKM_CANCEL,
2663 lockres->l_ops->unlock_ast,
2664 lockres);
2665 if (status != DLM_NORMAL) {
2666 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2667 ret = -EINVAL;
2668 ocfs2_recover_from_dlm_error(lockres, 0);
2669 }
2670
2671 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2672
2673 mlog_exit(ret);
2674 return ret;
2675}
2676
2677static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
2678 struct ocfs2_lock_res *lockres,
2679 int new_level)
2680{
2681 int ret;
2682
2683 mlog_entry_void();
2684
2685 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2686
2687 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
2688 ret = 0;
2689 mlog(0, "lockres %s currently being refreshed -- backing "
2690 "off!\n", lockres->l_name);
2691 } else if (new_level == LKM_PRMODE)
2692 ret = !lockres->l_ex_holders &&
2693 ocfs2_inode_fully_checkpointed(inode);
2694 else /* Must be NLMODE we're converting to. */
2695 ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
2696 ocfs2_inode_fully_checkpointed(inode);
2697
2698 mlog_exit(ret);
2699 return ret;
2700}
2701
2702static int ocfs2_do_unblock_meta(struct inode *inode,
2703 int *requeue)
2704{
2705 int new_level;
2706 int set_lvb = 0;
2707 int ret = 0;
2708 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
2709 unsigned long flags;
2710
2711 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2712
2713 mlog_entry_void();
2714
2715 spin_lock_irqsave(&lockres->l_lock, flags);
2716
2717 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2718
2719 mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
2720 lockres->l_blocking);
2721
2722 BUG_ON(lockres->l_level != LKM_EXMODE &&
2723 lockres->l_level != LKM_PRMODE);
2724
2725 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2726 *requeue = 1;
2727 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2728 spin_unlock_irqrestore(&lockres->l_lock, flags);
2729 if (ret) {
2730 ret = ocfs2_cancel_convert(osb, lockres);
2731 if (ret < 0)
2732 mlog_errno(ret);
2733 }
2734 goto leave;
2735 }
2736
2737 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2738
2739 mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
2740 lockres->l_level, lockres->l_blocking, new_level);
2741
2742 if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
2743 if (lockres->l_level == LKM_EXMODE)
2744 set_lvb = 1;
2745
2746 /* If the lock hasn't been refreshed yet (rare), then
2747		 * our in-memory inode values are stale and we skip
2748		 * stuffing the lvb. There's no need to actually clear
2749		 * out the lvb here as its value is still valid. */
2750 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
2751 if (set_lvb)
2752 __ocfs2_stuff_meta_lvb(inode);
2753 } else
2754 mlog(0, "lockres %s: downconverting stale lock!\n",
2755 lockres->l_name);
2756
2757 mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
2758 "l_blocking=%d, new_level=%d\n",
2759 lockres->l_level, lockres->l_blocking, new_level);
2760
2761 ocfs2_prepare_downconvert(lockres, new_level);
2762 spin_unlock_irqrestore(&lockres->l_lock, flags);
2763 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
2764 goto leave;
2765 }
2766 if (!ocfs2_inode_fully_checkpointed(inode))
2767 ocfs2_start_checkpoint(osb);
2768
2769 *requeue = 1;
2770 spin_unlock_irqrestore(&lockres->l_lock, flags);
2771 ret = 0;
2772leave:
2773 mlog_exit(ret);
2774 return ret;
2775}
2776
2777static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
2778 struct ocfs2_lock_res *lockres,
Mark Fashehd680efe2006-09-08 14:14:34 -07002779 struct ocfs2_unblock_ctl *ctl,
Mark Fashehccd979b2005-12-15 14:31:24 -08002780 ocfs2_convert_worker_t *worker)
2781{
2782 unsigned long flags;
2783 int blocking;
2784 int new_level;
2785 int ret = 0;
2786
2787 mlog_entry_void();
2788
2789 spin_lock_irqsave(&lockres->l_lock, flags);
2790
2791 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2792
2793recheck:
2794 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
Mark Fashehd680efe2006-09-08 14:14:34 -07002795 ctl->requeue = 1;
Mark Fashehccd979b2005-12-15 14:31:24 -08002796 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2797 spin_unlock_irqrestore(&lockres->l_lock, flags);
2798 if (ret) {
2799 ret = ocfs2_cancel_convert(osb, lockres);
2800 if (ret < 0)
2801 mlog_errno(ret);
2802 }
2803 goto leave;
2804 }
2805
2806 /* if we're blocking an exclusive and we have *any* holders,
2807 * then requeue. */
2808 if ((lockres->l_blocking == LKM_EXMODE)
2809 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
2810 spin_unlock_irqrestore(&lockres->l_lock, flags);
Mark Fashehd680efe2006-09-08 14:14:34 -07002811 ctl->requeue = 1;
Mark Fashehccd979b2005-12-15 14:31:24 -08002812 ret = 0;
2813 goto leave;
2814 }
2815
2816 /* If it's a PR we're blocking, then only
2817 * requeue if we've got any EX holders */
2818 if (lockres->l_blocking == LKM_PRMODE &&
2819 lockres->l_ex_holders) {
2820 spin_unlock_irqrestore(&lockres->l_lock, flags);
Mark Fashehd680efe2006-09-08 14:14:34 -07002821 ctl->requeue = 1;
Mark Fashehccd979b2005-12-15 14:31:24 -08002822 ret = 0;
2823 goto leave;
2824 }
2825
2826 /* If we get here, then we know that there are no more
2827 * incompatible holders (and anyone asking for an incompatible
2828 * lock is blocked). We can now downconvert the lock */
2829 if (!worker)
2830 goto downconvert;
2831
2832 /* Some lockres types want to do a bit of work before
2833 * downconverting a lock. Allow that here. The worker function
2834 * may sleep, so we save off a copy of what we're blocking as
2835 * it may change while we're not holding the spin lock. */
2836 blocking = lockres->l_blocking;
2837 spin_unlock_irqrestore(&lockres->l_lock, flags);
2838
Mark Fashehd680efe2006-09-08 14:14:34 -07002839 ctl->unblock_action = worker(lockres, blocking);
2840
2841 if (ctl->unblock_action == UNBLOCK_STOP_POST)
2842 goto leave;
Mark Fashehccd979b2005-12-15 14:31:24 -08002843
2844 spin_lock_irqsave(&lockres->l_lock, flags);
2845 if (blocking != lockres->l_blocking) {
2846 /* If this changed underneath us, then we can't drop
2847 * it just yet. */
2848 goto recheck;
2849 }
2850
2851downconvert:
Mark Fashehd680efe2006-09-08 14:14:34 -07002852 ctl->requeue = 0;
Mark Fashehccd979b2005-12-15 14:31:24 -08002853 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2854
2855 ocfs2_prepare_downconvert(lockres, new_level);
2856 spin_unlock_irqrestore(&lockres->l_lock, flags);
2857 ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
2858leave:
2859 mlog_exit(ret);
2860 return ret;
2861}
2862
Mark Fashehd680efe2006-09-08 14:14:34 -07002863static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2864 int blocking)
Mark Fashehccd979b2005-12-15 14:31:24 -08002865{
2866 struct inode *inode;
2867 struct address_space *mapping;
2868
Mark Fashehccd979b2005-12-15 14:31:24 -08002869 inode = ocfs2_lock_res_inode(lockres);
2870 mapping = inode->i_mapping;
2871
2872 if (filemap_fdatawrite(mapping)) {
Mark Fashehb0697052006-03-03 10:24:33 -08002873 mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
2874 (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08002875 }
2876 sync_mapping_buffers(mapping);
2877 if (blocking == LKM_EXMODE) {
2878 truncate_inode_pages(mapping, 0);
2879 unmap_mapping_range(mapping, 0, 0, 0);
2880 } else {
2881 /* We only need to wait on the I/O if we're not also
2882 * truncating pages because truncate_inode_pages waits
2883 * for us above. We don't truncate pages if we're
2884 * blocking anything < EXMODE because we want to keep
2885 * them around in that case. */
2886 filemap_fdatawait(mapping);
2887 }
2888
Mark Fashehd680efe2006-09-08 14:14:34 -07002889 return UNBLOCK_CONTINUE;
Mark Fashehccd979b2005-12-15 14:31:24 -08002890}
2891
2892int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
Mark Fashehd680efe2006-09-08 14:14:34 -07002893 struct ocfs2_unblock_ctl *ctl)
Mark Fashehccd979b2005-12-15 14:31:24 -08002894{
2895 int status;
2896 struct inode *inode;
2897 struct ocfs2_super *osb;
2898
2899 mlog_entry_void();
2900
2901 inode = ocfs2_lock_res_inode(lockres);
2902 osb = OCFS2_SB(inode->i_sb);
2903
Mark Fashehb0697052006-03-03 10:24:33 -08002904 mlog(0, "unblock inode %llu\n",
2905 (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08002906
Mark Fashehd680efe2006-09-08 14:14:34 -07002907 status = ocfs2_generic_unblock_lock(osb, lockres, ctl,
Mark Fashehccd979b2005-12-15 14:31:24 -08002908 ocfs2_data_convert_worker);
2909 if (status < 0)
2910 mlog_errno(status);
2911
Mark Fashehb0697052006-03-03 10:24:33 -08002912 mlog(0, "inode %llu, requeue = %d\n",
Mark Fashehd680efe2006-09-08 14:14:34 -07002913 (unsigned long long)OCFS2_I(inode)->ip_blkno, ctl->requeue);
Mark Fashehccd979b2005-12-15 14:31:24 -08002914
2915 mlog_exit(status);
2916 return status;
2917}
2918
2919static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
Mark Fashehd680efe2006-09-08 14:14:34 -07002920 struct ocfs2_unblock_ctl *ctl)
Mark Fashehccd979b2005-12-15 14:31:24 -08002921{
2922 int status;
2923 struct inode *inode;
2924
2925 mlog_entry_void();
2926
2927 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2928
2929 inode = ocfs2_lock_res_inode(lockres);
2930
2931 status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
Mark Fashehd680efe2006-09-08 14:14:34 -07002932 lockres, ctl, NULL);
Mark Fashehccd979b2005-12-15 14:31:24 -08002933 if (status < 0)
2934 mlog_errno(status);
2935
2936 mlog_exit(status);
2937 return status;
2938}
2939
Mark Fashehd680efe2006-09-08 14:14:34 -07002940static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
2941 struct ocfs2_unblock_ctl *ctl)
Mark Fashehccd979b2005-12-15 14:31:24 -08002942{
2943 int status;
2944 struct inode *inode;
2945
2946 mlog_entry_void();
2947
2948 inode = ocfs2_lock_res_inode(lockres);
2949
Mark Fashehb0697052006-03-03 10:24:33 -08002950 mlog(0, "unblock inode %llu\n",
2951 (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08002952
Mark Fashehd680efe2006-09-08 14:14:34 -07002953 status = ocfs2_do_unblock_meta(inode, &ctl->requeue);
Mark Fashehccd979b2005-12-15 14:31:24 -08002954 if (status < 0)
2955 mlog_errno(status);
2956
Mark Fashehb0697052006-03-03 10:24:33 -08002957 mlog(0, "inode %llu, requeue = %d\n",
Mark Fashehd680efe2006-09-08 14:14:34 -07002958 (unsigned long long)OCFS2_I(inode)->ip_blkno, ctl->requeue);
Mark Fashehccd979b2005-12-15 14:31:24 -08002959
2960 mlog_exit(status);
2961 return status;
2962}
2963
Mark Fashehd680efe2006-09-08 14:14:34 -07002964/*
2965 * Does the final reference drop on our dentry lock. Right now this
2966 * happens in the vote thread, but we could choose to simplify the
2967 * dlmglue API and push these off to the ocfs2_wq in the future.
2968 */
2969static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
2970 struct ocfs2_lock_res *lockres)
2971{
2972 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
2973 ocfs2_dentry_lock_put(osb, dl);
2974}
2975
2976/*
2977 * d_delete() matching dentries before the lock downconvert.
2978 *
2979 * At this point, any process waiting to destroy the
2980 * dentry_lock due to last ref count is stopped by the
2981 * OCFS2_LOCK_QUEUED flag.
2982 *
2983 * We have two potential problems
2984 *
2985 * 1) If we do the last reference drop on our dentry_lock (via dput)
2986 * we'll wind up in ocfs2_release_dentry_lock(), waiting on
2987 * the downconvert to finish. Instead we take an elevated
2988 * reference and push the drop until after we've completed our
2989 * unblock processing.
2990 *
2991 * 2) There might be another process with a final reference,
2992 * waiting on us to finish processing. If this is the case, we
2993 * detect it and exit out - there are no more dentries anyway.
2994 */
2995static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
2996 int blocking)
2997{
2998 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
2999 struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode);
3000 struct dentry *dentry;
3001 unsigned long flags;
3002 int extra_ref = 0;
3003
3004 /*
3005 * This node is blocking another node from getting a read
3006 * lock. This happens when we've renamed within a
3007 * directory. We've forced the other nodes to d_delete(), but
3008 * we never actually dropped our lock because it's still
3009 * valid. The downconvert code will retain a PR for this node,
3010 * so there's no further work to do.
3011 */
3012 if (blocking == LKM_PRMODE)
3013 return UNBLOCK_CONTINUE;
3014
3015 /*
3016 * Mark this inode as potentially orphaned. The code in
3017 * ocfs2_delete_inode() will figure out whether it actually
3018 * needs to be freed or not.
3019 */
3020 spin_lock(&oi->ip_lock);
3021 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
3022 spin_unlock(&oi->ip_lock);
3023
3024 /*
3025 * Yuck. We need to make sure however that the check of
3026 * OCFS2_LOCK_FREEING and the extra reference are atomic with
3027 * respect to a reference decrement or the setting of that
3028 * flag.
3029 */
3030 spin_lock_irqsave(&lockres->l_lock, flags);
3031 spin_lock(&dentry_attach_lock);
3032 if (!(lockres->l_flags & OCFS2_LOCK_FREEING)
3033 && dl->dl_count) {
3034 dl->dl_count++;
3035 extra_ref = 1;
3036 }
3037 spin_unlock(&dentry_attach_lock);
3038 spin_unlock_irqrestore(&lockres->l_lock, flags);
3039
3040 mlog(0, "extra_ref = %d\n", extra_ref);
3041
3042 /*
3043 * We have a process waiting on us in ocfs2_dentry_iput(),
3044 * which means we can't have any more outstanding
3045 * aliases. There's no need to do any more work.
3046 */
3047 if (!extra_ref)
3048 return UNBLOCK_CONTINUE;
3049
3050 spin_lock(&dentry_attach_lock);
3051 while (1) {
3052 dentry = ocfs2_find_local_alias(dl->dl_inode,
3053 dl->dl_parent_blkno, 1);
3054 if (!dentry)
3055 break;
3056 spin_unlock(&dentry_attach_lock);
3057
3058 mlog(0, "d_delete(%.*s);\n", dentry->d_name.len,
3059 dentry->d_name.name);
3060
3061 /*
3062 * The following dcache calls may do an
3063 * iput(). Normally we don't want that from the
3064 * downconverting thread, but in this case it's ok
3065 * because the requesting node already has an
3066 * exclusive lock on the inode, so it can't be queued
3067 * for a downconvert.
3068 */
3069 d_delete(dentry);
3070 dput(dentry);
3071
3072 spin_lock(&dentry_attach_lock);
3073 }
3074 spin_unlock(&dentry_attach_lock);
3075
3076 /*
3077 * If we are the last holder of this dentry lock, there is no
3078 * reason to downconvert so skip straight to the unlock.
3079 */
3080 if (dl->dl_count == 1)
3081 return UNBLOCK_STOP_POST;
3082
3083 return UNBLOCK_CONTINUE_POST;
3084}
3085
3086static int ocfs2_unblock_dentry_lock(struct ocfs2_lock_res *lockres,
3087 struct ocfs2_unblock_ctl *ctl)
3088{
3089 int ret;
3090 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
3091 struct ocfs2_super *osb = OCFS2_SB(dl->dl_inode->i_sb);
3092
3093 mlog(0, "unblock dentry lock: %llu\n",
3094 (unsigned long long)OCFS2_I(dl->dl_inode)->ip_blkno);
3095
3096 ret = ocfs2_generic_unblock_lock(osb,
3097 lockres,
3098 ctl,
3099 ocfs2_dentry_convert_worker);
3100 if (ret < 0)
3101 mlog_errno(ret);
3102
3103 mlog(0, "requeue = %d, post = %d\n", ctl->requeue, ctl->unblock_action);
3104
3105 return ret;
3106}
3107
Mark Fashehccd979b2005-12-15 14:31:24 -08003108/* Generic unblock function for any lockres whose private data is an
3109 * ocfs2_super pointer. */
3110static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
Mark Fashehd680efe2006-09-08 14:14:34 -07003111 struct ocfs2_unblock_ctl *ctl)
Mark Fashehccd979b2005-12-15 14:31:24 -08003112{
3113 int status;
3114 struct ocfs2_super *osb;
3115
3116 mlog_entry_void();
3117
3118 mlog(0, "Unblock lockres %s\n", lockres->l_name);
3119
3120 osb = ocfs2_lock_res_super(lockres);
3121
3122 status = ocfs2_generic_unblock_lock(osb,
3123 lockres,
Mark Fashehd680efe2006-09-08 14:14:34 -07003124 ctl,
Mark Fashehccd979b2005-12-15 14:31:24 -08003125 NULL);
3126 if (status < 0)
3127 mlog_errno(status);
3128
3129 mlog_exit(status);
3130 return status;
3131}
3132
3133void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3134 struct ocfs2_lock_res *lockres)
3135{
3136 int status;
Mark Fashehd680efe2006-09-08 14:14:34 -07003137 struct ocfs2_unblock_ctl ctl = {0, 0,};
Mark Fashehccd979b2005-12-15 14:31:24 -08003138 unsigned long flags;
3139
3140 /* Our reference to the lockres in this function can be
3141 * considered valid until we remove the OCFS2_LOCK_QUEUED
3142 * flag. */
3143
3144 mlog_entry_void();
3145
3146 BUG_ON(!lockres);
3147 BUG_ON(!lockres->l_ops);
3148 BUG_ON(!lockres->l_ops->unblock);
3149
3150 mlog(0, "lockres %s blocked.\n", lockres->l_name);
3151
3152 /* Detect whether a lock has been marked as going away while
3153 * the vote thread was processing other things. A lock can
3154 * still be marked with OCFS2_LOCK_FREEING after this check,
3155	 * but short-circuiting here will still save us some
3156	 * work. */
3157 spin_lock_irqsave(&lockres->l_lock, flags);
3158 if (lockres->l_flags & OCFS2_LOCK_FREEING)
3159 goto unqueue;
3160 spin_unlock_irqrestore(&lockres->l_lock, flags);
3161
Mark Fashehd680efe2006-09-08 14:14:34 -07003162 status = lockres->l_ops->unblock(lockres, &ctl);
Mark Fashehccd979b2005-12-15 14:31:24 -08003163 if (status < 0)
3164 mlog_errno(status);
3165
3166 spin_lock_irqsave(&lockres->l_lock, flags);
3167unqueue:
Mark Fashehd680efe2006-09-08 14:14:34 -07003168 if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) {
Mark Fashehccd979b2005-12-15 14:31:24 -08003169 lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
3170 } else
3171 ocfs2_schedule_blocked_lock(osb, lockres);
3172
3173 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
Mark Fashehd680efe2006-09-08 14:14:34 -07003174 ctl.requeue ? "yes" : "no");
Mark Fashehccd979b2005-12-15 14:31:24 -08003175 spin_unlock_irqrestore(&lockres->l_lock, flags);
3176
Mark Fashehd680efe2006-09-08 14:14:34 -07003177 if (ctl.unblock_action != UNBLOCK_CONTINUE
3178 && lockres->l_ops->post_unlock)
3179 lockres->l_ops->post_unlock(osb, lockres);
3180
Mark Fashehccd979b2005-12-15 14:31:24 -08003181 mlog_exit_void();
3182}
3183
3184static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3185 struct ocfs2_lock_res *lockres)
3186{
3187 mlog_entry_void();
3188
3189 assert_spin_locked(&lockres->l_lock);
3190
3191 if (lockres->l_flags & OCFS2_LOCK_FREEING) {
3192 /* Do not schedule a lock for downconvert when it's on
3193 * the way to destruction - any nodes wanting access
3194 * to the resource will get it soon. */
3195 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
3196 lockres->l_name, lockres->l_flags);
3197 return;
3198 }
3199
3200 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
3201
3202 spin_lock(&osb->vote_task_lock);
3203 if (list_empty(&lockres->l_blocked_list)) {
3204 list_add_tail(&lockres->l_blocked_list,
3205 &osb->blocked_lock_list);
3206 osb->blocked_lock_count++;
3207 }
3208 spin_unlock(&osb->vote_task_lock);
3209
3210 mlog_exit_void();
3211}
3212
3213/* This aids in debugging situations where a bad LVB might be involved. */
3214void ocfs2_dump_meta_lvb_info(u64 level,
3215 const char *function,
3216 unsigned int line,
3217 struct ocfs2_lock_res *lockres)
3218{
3219 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
3220
3221 mlog(level, "LVB information for %s (called from %s:%u):\n",
3222 lockres->l_name, function, line);
Mark Fashehf9e2d822006-09-12 15:35:49 -07003223 mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
3224 lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
3225 be32_to_cpu(lvb->lvb_igeneration));
Mark Fashehb0697052006-03-03 10:24:33 -08003226 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
3227 (unsigned long long)be64_to_cpu(lvb->lvb_isize),
3228 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
3229 be16_to_cpu(lvb->lvb_imode));
3230 mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
Herbert Poetzlca4d1472006-07-03 17:27:12 -07003231 "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
Mark Fashehb0697052006-03-03 10:24:33 -08003232 (long long)be64_to_cpu(lvb->lvb_iatime_packed),
3233 (long long)be64_to_cpu(lvb->lvb_ictime_packed),
Herbert Poetzlca4d1472006-07-03 17:27:12 -07003234 (long long)be64_to_cpu(lvb->lvb_imtime_packed),
3235 be32_to_cpu(lvb->lvb_iattr));
Mark Fashehccd979b2005-12-15 14:31:24 -08003236}
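
/*
 * Example invocation (illustrative only; any ML_* mask works and the
 * caller passes its own function/line information):
 *
 *	ocfs2_dump_meta_lvb_info(ML_ERROR, __PRETTY_FUNCTION__, __LINE__,
 *				 &OCFS2_I(inode)->ip_meta_lockres);
 */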