/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * dlmglue.c
 *
 * Code which implements an OCFS2 specific interface to our DLM.
 *
 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/crc32.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include <cluster/heartbeat.h>
#include <cluster/nodemanager.h>
#include <cluster/tcp.h>

#include <dlm/dlmapi.h>

#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "dcache.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "slot_map.h"
#include "super.h"
#include "uptodate.h"
#include "vote.h"

#include "buffer_head_io.h"

struct ocfs2_mask_waiter {
	struct list_head	mw_item;
	int			mw_status;
	struct completion	mw_complete;
	unsigned long		mw_mask;
	unsigned long		mw_goal;
};

static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);

/*
 * Return value from ->downconvert_worker functions.
 *
 * These control the precise actions of ocfs2_unblock_lock()
 * and ocfs2_process_blocked_lock()
 *
 */
enum ocfs2_unblock_action {
	UNBLOCK_CONTINUE	= 0, /* Continue downconvert */
	UNBLOCK_CONTINUE_POST	= 1, /* Continue downconvert, fire
				      * ->post_unlock callback */
	UNBLOCK_STOP_POST	= 2, /* Do not downconvert, fire
				      * ->post_unlock() callback. */
};

struct ocfs2_unblock_ctl {
	int requeue;
	enum ocfs2_unblock_action unblock_action;
};
93
Mark Fasheh810d5ae2006-09-13 21:39:52 -070094static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
95 int new_level);
96static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
97
Mark Fashehcc567d82006-09-13 21:52:21 -070098static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
99 int blocking);
100
Mark Fashehcc567d82006-09-13 21:52:21 -0700101static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
102 int blocking);
Mark Fashehd680efe2006-09-08 14:14:34 -0700103
104static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
105 struct ocfs2_lock_res *lockres);
Mark Fashehccd979b2005-12-15 14:31:24 -0800106
Adrian Bunk6cb129f2007-04-26 00:29:35 -0700107
108#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
109
110/* This aids in debugging situations where a bad LVB might be involved. */
111static void ocfs2_dump_meta_lvb_info(u64 level,
112 const char *function,
113 unsigned int line,
114 struct ocfs2_lock_res *lockres)
115{
116 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
117
118 mlog(level, "LVB information for %s (called from %s:%u):\n",
119 lockres->l_name, function, line);
120 mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
121 lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
122 be32_to_cpu(lvb->lvb_igeneration));
123 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
124 (unsigned long long)be64_to_cpu(lvb->lvb_isize),
125 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
126 be16_to_cpu(lvb->lvb_imode));
127 mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
128 "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
129 (long long)be64_to_cpu(lvb->lvb_iatime_packed),
130 (long long)be64_to_cpu(lvb->lvb_ictime_packed),
131 (long long)be64_to_cpu(lvb->lvb_imtime_packed),
132 be32_to_cpu(lvb->lvb_iattr));
133}
134

/*
 * OCFS2 Lock Resource Operations
 *
 * These fine tune the behavior of the generic dlmglue locking infrastructure.
 *
 * The most basic of lock types can point ->l_priv to their respective
 * struct ocfs2_super and allow the default actions to manage things.
 *
 * Right now, each lock type also needs to implement an init function,
 * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres()
 * should be called when the lock is no longer needed (i.e., object
 * destruction time).
 */
struct ocfs2_lock_res_ops {
	/*
	 * Translate an ocfs2_lock_res * into an ocfs2_super *. Define
	 * this callback if ->l_priv is not an ocfs2_super pointer
	 */
	struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);

	/*
	 * Optionally called in the downconvert (or "vote") thread
	 * after a successful downconvert. The lockres will not be
	 * referenced after this callback is called, so it is safe to
	 * free memory, etc.
	 *
	 * The exact semantics of when this is called are controlled
	 * by ->downconvert_worker()
	 */
	void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *);

	/*
	 * Allow a lock type to add checks to determine whether it is
	 * safe to downconvert a lock. Return 0 to re-queue the
	 * downconvert at a later time, nonzero to continue.
	 *
	 * For most locks, the default checks that there are no
	 * incompatible holders are sufficient.
	 *
	 * Called with the lockres spinlock held.
	 */
	int (*check_downconvert)(struct ocfs2_lock_res *, int);

	/*
	 * Allows a lock type to populate the lock value block. This
	 * is called on downconvert, and when we drop a lock.
	 *
	 * Locks that want to use this should set LOCK_TYPE_USES_LVB
	 * in the flags field.
	 *
	 * Called with the lockres spinlock held.
	 */
	void (*set_lvb)(struct ocfs2_lock_res *);

	/*
	 * Called from the downconvert thread when it is determined
	 * that a lock will be downconverted. This is called without
	 * any locks held so the function can do work that might
	 * schedule (syncing out data, etc).
	 *
	 * This should return any one of the ocfs2_unblock_action
	 * values, depending on what it wants the thread to do.
	 */
	int (*downconvert_worker)(struct ocfs2_lock_res *, int);

	/*
	 * LOCK_TYPE_* flags which describe the specific requirements
	 * of a lock type. Descriptions of each individual flag follow.
	 */
	int flags;
};

/*
 * Some locks want to "refresh" potentially stale data when a
 * meaningful (PRMODE or EXMODE) lock level is first obtained. If this
 * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the
 * individual lockres l_flags member from the ast function. It is
 * expected that the locking wrapper will clear the
 * OCFS2_LOCK_NEEDS_REFRESH flag when done.
 */
#define LOCK_TYPE_REQUIRES_REFRESH 0x1

/*
 * Indicate that a lock type makes use of the lock value block. The
 * ->set_lvb lock type callback must be defined.
 */
#define LOCK_TYPE_USES_LVB 0x2

static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
	.get_osb	= ocfs2_get_inode_osb,
	.flags		= 0,
};

static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
	.get_osb	= ocfs2_get_inode_osb,
	.check_downconvert = ocfs2_check_meta_downconvert,
	.set_lvb	= ocfs2_set_meta_lvb,
	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};

static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
	.get_osb	= ocfs2_get_inode_osb,
	.downconvert_worker = ocfs2_data_convert_worker,
	.flags		= 0,
};

static struct ocfs2_lock_res_ops ocfs2_super_lops = {
	.flags		= LOCK_TYPE_REQUIRES_REFRESH,
};

static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
	.flags		= 0,
};

static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
	.get_osb	= ocfs2_get_dentry_osb,
	.post_unlock	= ocfs2_dentry_post_unlock,
	.downconvert_worker = ocfs2_dentry_convert_worker,
	.flags		= 0,
};

static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
	.get_osb	= ocfs2_get_inode_osb,
	.flags		= 0,
};
261
Mark Fashehccd979b2005-12-15 14:31:24 -0800262static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
263{
264 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
265 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
Tiger Yang50008632007-03-20 16:01:38 -0700266 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
267 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
Mark Fashehccd979b2005-12-15 14:31:24 -0800268}
269
Mark Fashehccd979b2005-12-15 14:31:24 -0800270static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
271{
272 BUG_ON(!ocfs2_is_inode_lock(lockres));
273
274 return (struct inode *) lockres->l_priv;
275}
276
Mark Fashehd680efe2006-09-08 14:14:34 -0700277static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres)
278{
279 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY);
280
281 return (struct ocfs2_dentry_lock *)lockres->l_priv;
282}
283
Mark Fasheh54a7e752006-09-12 21:49:13 -0700284static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
285{
286 if (lockres->l_ops->get_osb)
287 return lockres->l_ops->get_osb(lockres);
288
289 return (struct ocfs2_super *)lockres->l_priv;
290}
291
Mark Fashehccd979b2005-12-15 14:31:24 -0800292static int ocfs2_lock_create(struct ocfs2_super *osb,
293 struct ocfs2_lock_res *lockres,
294 int level,
295 int dlm_flags);
296static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
297 int wanted);
298static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
299 struct ocfs2_lock_res *lockres,
300 int level);
301static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
302static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
303static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
304static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
305static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
306 struct ocfs2_lock_res *lockres);
307static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
308 int convert);
309#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \
310 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
311 "resource %s: %s\n", dlm_errname(_stat), _func, \
312 _lockres->l_name, dlm_errmsg(_stat)); \
313} while (0)
314static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
315 struct ocfs2_lock_res *lockres);
316static int ocfs2_meta_lock_update(struct inode *inode,
317 struct buffer_head **bh);
318static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
319static inline int ocfs2_highest_compat_lock_level(int level);
Mark Fashehccd979b2005-12-15 14:31:24 -0800320
Mark Fashehccd979b2005-12-15 14:31:24 -0800321static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
322 u64 blkno,
323 u32 generation,
324 char *name)
325{
326 int len;
327
328 mlog_entry_void();
329
330 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
331
Mark Fashehb0697052006-03-03 10:24:33 -0800332 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
333 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
334 (long long)blkno, generation);
Mark Fashehccd979b2005-12-15 14:31:24 -0800335
336 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
337
338 mlog(0, "built lock resource with name: %s\n", name);
339
340 mlog_exit_void();
341}
342
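/*
 * For illustration only (not from the original source): with the
 * "%c%s%016llx%08x" format above, a lock name is the lock type
 * character, the OCFS2_LOCK_ID_PAD string, 16 hex digits of block
 * number and 8 hex digits of generation, filling exactly
 * OCFS2_LOCK_ID_MAX_LEN - 1 characters.  Assuming the pad is six '0'
 * characters (the length implied by the BUG_ON above; see
 * ocfs2_lockid.h for the real string) and taking 'M' as the metadata
 * type character, a meta lock on block 0x1234 with generation 0x5678
 * would look roughly like "M000000000000000000123400005678".
 */
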
Ingo Molnar34af9462006-06-27 02:53:55 -0700343static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
Mark Fashehccd979b2005-12-15 14:31:24 -0800344
345static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
346 struct ocfs2_dlm_debug *dlm_debug)
347{
348 mlog(0, "Add tracking for lockres %s\n", res->l_name);
349
350 spin_lock(&ocfs2_dlm_tracking_lock);
351 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
352 spin_unlock(&ocfs2_dlm_tracking_lock);
353}
354
355static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
356{
357 spin_lock(&ocfs2_dlm_tracking_lock);
358 if (!list_empty(&res->l_debug_list))
359 list_del_init(&res->l_debug_list);
360 spin_unlock(&ocfs2_dlm_tracking_lock);
361}
362
363static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
364 struct ocfs2_lock_res *res,
365 enum ocfs2_lock_type type,
Mark Fashehccd979b2005-12-15 14:31:24 -0800366 struct ocfs2_lock_res_ops *ops,
367 void *priv)
368{
Mark Fashehccd979b2005-12-15 14:31:24 -0800369 res->l_type = type;
370 res->l_ops = ops;
371 res->l_priv = priv;
372
373 res->l_level = LKM_IVMODE;
374 res->l_requested = LKM_IVMODE;
375 res->l_blocking = LKM_IVMODE;
376 res->l_action = OCFS2_AST_INVALID;
377 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
378
379 res->l_flags = OCFS2_LOCK_INITIALIZED;
380
381 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
382}
383
384void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
385{
386 /* This also clears out the lock status block */
387 memset(res, 0, sizeof(struct ocfs2_lock_res));
388 spin_lock_init(&res->l_lock);
389 init_waitqueue_head(&res->l_event);
390 INIT_LIST_HEAD(&res->l_blocked_list);
391 INIT_LIST_HEAD(&res->l_mask_waiters);
392}
393
394void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
395 enum ocfs2_lock_type type,
Mark Fasheh24c19ef2006-09-22 17:28:19 -0700396 unsigned int generation,
Mark Fashehccd979b2005-12-15 14:31:24 -0800397 struct inode *inode)
398{
399 struct ocfs2_lock_res_ops *ops;
400
401 switch(type) {
402 case OCFS2_LOCK_TYPE_RW:
403 ops = &ocfs2_inode_rw_lops;
404 break;
405 case OCFS2_LOCK_TYPE_META:
406 ops = &ocfs2_inode_meta_lops;
407 break;
408 case OCFS2_LOCK_TYPE_DATA:
409 ops = &ocfs2_inode_data_lops;
410 break;
Tiger Yang50008632007-03-20 16:01:38 -0700411 case OCFS2_LOCK_TYPE_OPEN:
412 ops = &ocfs2_inode_open_lops;
413 break;
Mark Fashehccd979b2005-12-15 14:31:24 -0800414 default:
415 mlog_bug_on_msg(1, "type: %d\n", type);
416 ops = NULL; /* thanks, gcc */
417 break;
418 };
419
Mark Fashehd680efe2006-09-08 14:14:34 -0700420 ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
Mark Fasheh24c19ef2006-09-22 17:28:19 -0700421 generation, res->l_name);
Mark Fashehd680efe2006-09-08 14:14:34 -0700422 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode);
423}
424
Mark Fasheh54a7e752006-09-12 21:49:13 -0700425static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
426{
427 struct inode *inode = ocfs2_lock_res_inode(lockres);
428
429 return OCFS2_SB(inode->i_sb);
430}
431
Mark Fashehd680efe2006-09-08 14:14:34 -0700432static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
433{
434 __be64 inode_blkno_be;
435
436 memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START],
437 sizeof(__be64));
438
439 return be64_to_cpu(inode_blkno_be);
440}
441
Mark Fasheh54a7e752006-09-12 21:49:13 -0700442static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres)
443{
444 struct ocfs2_dentry_lock *dl = lockres->l_priv;
445
446 return OCFS2_SB(dl->dl_inode->i_sb);
447}
448
Mark Fashehd680efe2006-09-08 14:14:34 -0700449void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
450 u64 parent, struct inode *inode)
451{
452 int len;
453 u64 inode_blkno = OCFS2_I(inode)->ip_blkno;
454 __be64 inode_blkno_be = cpu_to_be64(inode_blkno);
455 struct ocfs2_lock_res *lockres = &dl->dl_lockres;
456
457 ocfs2_lock_res_init_once(lockres);
458
459 /*
460 * Unfortunately, the standard lock naming scheme won't work
461 * here because we have two 16 byte values to use. Instead,
462 * we'll stuff the inode number as a binary value. We still
463 * want error prints to show something without garbling the
464 * display, so drop a null byte in there before the inode
465 * number. A future version of OCFS2 will likely use all
466 * binary lock names. The stringified names have been a
467 * tremendous aid in debugging, but now that the debugfs
468 * interface exists, we can mangle things there if need be.
469 *
470 * NOTE: We also drop the standard "pad" value (the total lock
471 * name size stays the same though - the last part is all
472 * zeros due to the memset in ocfs2_lock_res_init_once()
473 */
474 len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START,
475 "%c%016llx",
476 ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY),
477 (long long)parent);
478
479 BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1));
480
481 memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be,
482 sizeof(__be64));
483
484 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
485 OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops,
486 dl);
Mark Fashehccd979b2005-12-15 14:31:24 -0800487}
488
489static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
490 struct ocfs2_super *osb)
491{
492 /* Superblock lockres doesn't come from a slab so we call init
493 * once on it manually. */
494 ocfs2_lock_res_init_once(res);
Mark Fashehd680efe2006-09-08 14:14:34 -0700495 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO,
496 0, res->l_name);
Mark Fashehccd979b2005-12-15 14:31:24 -0800497 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
Mark Fashehccd979b2005-12-15 14:31:24 -0800498 &ocfs2_super_lops, osb);
499}
500
501static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
502 struct ocfs2_super *osb)
503{
504 /* Rename lockres doesn't come from a slab so we call init
505 * once on it manually. */
506 ocfs2_lock_res_init_once(res);
Mark Fashehd680efe2006-09-08 14:14:34 -0700507 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name);
508 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME,
Mark Fashehccd979b2005-12-15 14:31:24 -0800509 &ocfs2_rename_lops, osb);
510}
511
512void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
513{
514 mlog_entry_void();
515
516 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
517 return;
518
519 ocfs2_remove_lockres_tracking(res);
520
521 mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
522 "Lockres %s is on the blocked list\n",
523 res->l_name);
524 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
525 "Lockres %s has mask waiters pending\n",
526 res->l_name);
527 mlog_bug_on_msg(spin_is_locked(&res->l_lock),
528 "Lockres %s is locked\n",
529 res->l_name);
530 mlog_bug_on_msg(res->l_ro_holders,
531 "Lockres %s has %u ro holders\n",
532 res->l_name, res->l_ro_holders);
533 mlog_bug_on_msg(res->l_ex_holders,
534 "Lockres %s has %u ex holders\n",
535 res->l_name, res->l_ex_holders);
536
537 /* Need to clear out the lock status block for the dlm */
538 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
539
540 res->l_flags = 0UL;
541 mlog_exit_void();
542}
543
544static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
545 int level)
546{
547 mlog_entry_void();
548
549 BUG_ON(!lockres);
550
551 switch(level) {
552 case LKM_EXMODE:
553 lockres->l_ex_holders++;
554 break;
555 case LKM_PRMODE:
556 lockres->l_ro_holders++;
557 break;
558 default:
559 BUG();
560 }
561
562 mlog_exit_void();
563}
564
565static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
566 int level)
567{
568 mlog_entry_void();
569
570 BUG_ON(!lockres);
571
572 switch(level) {
573 case LKM_EXMODE:
574 BUG_ON(!lockres->l_ex_holders);
575 lockres->l_ex_holders--;
576 break;
577 case LKM_PRMODE:
578 BUG_ON(!lockres->l_ro_holders);
579 lockres->l_ro_holders--;
580 break;
581 default:
582 BUG();
583 }
584 mlog_exit_void();
585}
586
587/* WARNING: This function lives in a world where the only three lock
588 * levels are EX, PR, and NL. It *will* have to be adjusted when more
589 * lock types are added. */
590static inline int ocfs2_highest_compat_lock_level(int level)
591{
592 int new_level = LKM_EXMODE;
593
594 if (level == LKM_EXMODE)
595 new_level = LKM_NLMODE;
596 else if (level == LKM_PRMODE)
597 new_level = LKM_PRMODE;
598 return new_level;
599}
600
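/*
 * Worked example, derived from the function above (illustration only):
 * if another node is asking for EX, the highest level we can keep is
 * NL; if it is asking for PR, we can keep PR; any other (NL) request is
 * compatible with everything, so EX may be kept.  Callers such as
 * ocfs2_may_continue_on_blocked_lock() compare the level they want
 * against ocfs2_highest_compat_lock_level(lockres->l_blocking).
 */
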
601static void lockres_set_flags(struct ocfs2_lock_res *lockres,
602 unsigned long newflags)
603{
604 struct list_head *pos, *tmp;
605 struct ocfs2_mask_waiter *mw;
606
607 assert_spin_locked(&lockres->l_lock);
608
609 lockres->l_flags = newflags;
610
611 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
612 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
613 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
614 continue;
615
616 list_del_init(&mw->mw_item);
617 mw->mw_status = 0;
618 complete(&mw->mw_complete);
619 }
620}
621static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
622{
623 lockres_set_flags(lockres, lockres->l_flags | or);
624}
625static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
626 unsigned long clear)
627{
628 lockres_set_flags(lockres, lockres->l_flags & ~clear);
629}
630
631static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
632{
633 mlog_entry_void();
634
635 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
636 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
637 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
638 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
639
640 lockres->l_level = lockres->l_requested;
641 if (lockres->l_level <=
642 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
643 lockres->l_blocking = LKM_NLMODE;
644 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
645 }
646 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
647
648 mlog_exit_void();
649}
650
651static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
652{
653 mlog_entry_void();
654
655 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
656 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
657
	/* Convert from RO to EX doesn't really need anything as our
	 * information is already up to date. Convert from NL to
	 * *anything* however should mark ourselves as needing an
	 * update */
Mark Fashehf625c972006-09-12 21:24:53 -0700662 if (lockres->l_level == LKM_NLMODE &&
663 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
Mark Fashehccd979b2005-12-15 14:31:24 -0800664 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
665
666 lockres->l_level = lockres->l_requested;
667 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
668
669 mlog_exit_void();
670}
671
672static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
673{
674 mlog_entry_void();
675
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
677 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
678
679 if (lockres->l_requested > LKM_NLMODE &&
Mark Fashehf625c972006-09-12 21:24:53 -0700680 !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
681 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
Mark Fashehccd979b2005-12-15 14:31:24 -0800682 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
683
684 lockres->l_level = lockres->l_requested;
685 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
686 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
687
688 mlog_exit_void();
689}
690
Mark Fashehccd979b2005-12-15 14:31:24 -0800691static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
692 int level)
693{
694 int needs_downconvert = 0;
695 mlog_entry_void();
696
697 assert_spin_locked(&lockres->l_lock);
698
699 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
700
701 if (level > lockres->l_blocking) {
702 /* only schedule a downconvert if we haven't already scheduled
703 * one that goes low enough to satisfy the level we're
704 * blocking. this also catches the case where we get
705 * duplicate BASTs */
706 if (ocfs2_highest_compat_lock_level(level) <
707 ocfs2_highest_compat_lock_level(lockres->l_blocking))
708 needs_downconvert = 1;
709
710 lockres->l_blocking = level;
711 }
712
713 mlog_exit(needs_downconvert);
714 return needs_downconvert;
715}
716
Mark Fashehaa2623a2006-09-12 21:58:23 -0700717static void ocfs2_blocking_ast(void *opaque, int level)
Mark Fashehccd979b2005-12-15 14:31:24 -0800718{
Mark Fashehaa2623a2006-09-12 21:58:23 -0700719 struct ocfs2_lock_res *lockres = opaque;
720 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
Mark Fashehccd979b2005-12-15 14:31:24 -0800721 int needs_downconvert;
722 unsigned long flags;
723
Mark Fashehccd979b2005-12-15 14:31:24 -0800724 BUG_ON(level <= LKM_NLMODE);
725
Mark Fashehaa2623a2006-09-12 21:58:23 -0700726 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
727 lockres->l_name, level, lockres->l_level,
728 ocfs2_lock_type_string(lockres->l_type));
729
Mark Fashehccd979b2005-12-15 14:31:24 -0800730 spin_lock_irqsave(&lockres->l_lock, flags);
731 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
732 if (needs_downconvert)
733 ocfs2_schedule_blocked_lock(osb, lockres);
734 spin_unlock_irqrestore(&lockres->l_lock, flags);
735
Mark Fashehd680efe2006-09-08 14:14:34 -0700736 wake_up(&lockres->l_event);
737
Mark Fashehccd979b2005-12-15 14:31:24 -0800738 ocfs2_kick_vote_thread(osb);
Mark Fashehccd979b2005-12-15 14:31:24 -0800739}
740
Mark Fashehe92d57d2006-09-12 21:34:35 -0700741static void ocfs2_locking_ast(void *opaque)
Mark Fashehccd979b2005-12-15 14:31:24 -0800742{
Mark Fashehe92d57d2006-09-12 21:34:35 -0700743 struct ocfs2_lock_res *lockres = opaque;
Mark Fashehccd979b2005-12-15 14:31:24 -0800744 struct dlm_lockstatus *lksb = &lockres->l_lksb;
745 unsigned long flags;
746
747 spin_lock_irqsave(&lockres->l_lock, flags);
748
749 if (lksb->status != DLM_NORMAL) {
750 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
751 lockres->l_name, lksb->status);
752 spin_unlock_irqrestore(&lockres->l_lock, flags);
753 return;
754 }
755
756 switch(lockres->l_action) {
757 case OCFS2_AST_ATTACH:
758 ocfs2_generic_handle_attach_action(lockres);
Mark Fashehe92d57d2006-09-12 21:34:35 -0700759 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
Mark Fashehccd979b2005-12-15 14:31:24 -0800760 break;
761 case OCFS2_AST_CONVERT:
762 ocfs2_generic_handle_convert_action(lockres);
763 break;
764 case OCFS2_AST_DOWNCONVERT:
765 ocfs2_generic_handle_downconvert_action(lockres);
766 break;
767 default:
Mark Fashehe92d57d2006-09-12 21:34:35 -0700768 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
769 "lockres flags = 0x%lx, unlock action: %u\n",
770 lockres->l_name, lockres->l_action, lockres->l_flags,
771 lockres->l_unlock_action);
Mark Fashehccd979b2005-12-15 14:31:24 -0800772 BUG();
773 }
774
Mark Fashehccd979b2005-12-15 14:31:24 -0800775 /* set it to something invalid so if we get called again we
776 * can catch it. */
777 lockres->l_action = OCFS2_AST_INVALID;
Mark Fashehccd979b2005-12-15 14:31:24 -0800778
779 wake_up(&lockres->l_event);
Mark Fashehd680efe2006-09-08 14:14:34 -0700780 spin_unlock_irqrestore(&lockres->l_lock, flags);
Mark Fashehccd979b2005-12-15 14:31:24 -0800781}
782
Mark Fashehccd979b2005-12-15 14:31:24 -0800783static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
784 int convert)
785{
786 unsigned long flags;
787
788 mlog_entry_void();
789 spin_lock_irqsave(&lockres->l_lock, flags);
790 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
791 if (convert)
792 lockres->l_action = OCFS2_AST_INVALID;
793 else
794 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
795 spin_unlock_irqrestore(&lockres->l_lock, flags);
796
797 wake_up(&lockres->l_event);
798 mlog_exit_void();
799}
800
801/* Note: If we detect another process working on the lock (i.e.,
802 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
803 * to do the right thing in that case.
804 */
805static int ocfs2_lock_create(struct ocfs2_super *osb,
806 struct ocfs2_lock_res *lockres,
807 int level,
808 int dlm_flags)
809{
810 int ret = 0;
Sunil Mushranc271c5c2006-12-05 17:56:35 -0800811 enum dlm_status status = DLM_NORMAL;
Mark Fashehccd979b2005-12-15 14:31:24 -0800812 unsigned long flags;
813
814 mlog_entry_void();
815
816 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
817 dlm_flags);
818
819 spin_lock_irqsave(&lockres->l_lock, flags);
820 if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
821 (lockres->l_flags & OCFS2_LOCK_BUSY)) {
822 spin_unlock_irqrestore(&lockres->l_lock, flags);
823 goto bail;
824 }
825
826 lockres->l_action = OCFS2_AST_ATTACH;
827 lockres->l_requested = level;
828 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
829 spin_unlock_irqrestore(&lockres->l_lock, flags);
830
831 status = dlmlock(osb->dlm,
832 level,
833 &lockres->l_lksb,
834 dlm_flags,
835 lockres->l_name,
Mark Fashehf0681062006-09-08 11:40:10 -0700836 OCFS2_LOCK_ID_MAX_LEN - 1,
Mark Fashehe92d57d2006-09-12 21:34:35 -0700837 ocfs2_locking_ast,
Mark Fashehccd979b2005-12-15 14:31:24 -0800838 lockres,
Mark Fashehaa2623a2006-09-12 21:58:23 -0700839 ocfs2_blocking_ast);
Mark Fashehccd979b2005-12-15 14:31:24 -0800840 if (status != DLM_NORMAL) {
841 ocfs2_log_dlm_error("dlmlock", status, lockres);
842 ret = -EINVAL;
843 ocfs2_recover_from_dlm_error(lockres, 1);
844 }
845
	mlog(0, "lock %s, successful return from dlmlock\n", lockres->l_name);
847
848bail:
849 mlog_exit(ret);
850 return ret;
851}
852
853static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
854 int flag)
855{
856 unsigned long flags;
857 int ret;
858
859 spin_lock_irqsave(&lockres->l_lock, flags);
860 ret = lockres->l_flags & flag;
861 spin_unlock_irqrestore(&lockres->l_lock, flags);
862
863 return ret;
864}
865
866static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
867
868{
869 wait_event(lockres->l_event,
870 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
871}
872
873static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
874
875{
876 wait_event(lockres->l_event,
877 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
878}
879
880/* predict what lock level we'll be dropping down to on behalf
881 * of another node, and return true if the currently wanted
882 * level will be compatible with it. */
883static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
884 int wanted)
885{
886 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
887
888 return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
889}
890
891static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
892{
893 INIT_LIST_HEAD(&mw->mw_item);
894 init_completion(&mw->mw_complete);
895}
896
897static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
898{
899 wait_for_completion(&mw->mw_complete);
900 /* Re-arm the completion in case we want to wait on it again */
901 INIT_COMPLETION(mw->mw_complete);
902 return mw->mw_status;
903}
904
905static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
906 struct ocfs2_mask_waiter *mw,
907 unsigned long mask,
908 unsigned long goal)
909{
910 BUG_ON(!list_empty(&mw->mw_item));
911
912 assert_spin_locked(&lockres->l_lock);
913
914 list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
915 mw->mw_mask = mask;
916 mw->mw_goal = goal;
917}
918
919/* returns 0 if the mw that was removed was already satisfied, -EBUSY
920 * if the mask still hadn't reached its goal */
921static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
922 struct ocfs2_mask_waiter *mw)
923{
924 unsigned long flags;
925 int ret = 0;
926
927 spin_lock_irqsave(&lockres->l_lock, flags);
928 if (!list_empty(&mw->mw_item)) {
929 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
930 ret = -EBUSY;
931
932 list_del_init(&mw->mw_item);
933 init_completion(&mw->mw_complete);
934 }
935 spin_unlock_irqrestore(&lockres->l_lock, flags);
936
937 return ret;
938
939}
940
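/*
 * Rough sketch of how the mask waiters above are used (an illustrative
 * summary of ocfs2_cluster_lock() below): with l_lock held, the locking
 * path calls lockres_add_mask_waiter() with a (mask, goal) pair -- for
 * example mask = OCFS2_LOCK_BUSY, goal = 0 to wait for a pending dlm
 * call to finish -- then drops the spinlock and sleeps in
 * ocfs2_wait_for_mask().  Whenever lockres_set_flags() changes l_flags,
 * it completes every waiter whose masked flags match its goal.
 */
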
941static int ocfs2_cluster_lock(struct ocfs2_super *osb,
942 struct ocfs2_lock_res *lockres,
943 int level,
944 int lkm_flags,
945 int arg_flags)
946{
947 struct ocfs2_mask_waiter mw;
948 enum dlm_status status;
949 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
950 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
951 unsigned long flags;
952
953 mlog_entry_void();
954
955 ocfs2_init_mask_waiter(&mw);
956
Mark Fashehb80fc012006-09-12 22:08:14 -0700957 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
958 lkm_flags |= LKM_VALBLK;
959
Mark Fashehccd979b2005-12-15 14:31:24 -0800960again:
961 wait = 0;
962
963 if (catch_signals && signal_pending(current)) {
964 ret = -ERESTARTSYS;
965 goto out;
966 }
967
968 spin_lock_irqsave(&lockres->l_lock, flags);
969
970 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
971 "Cluster lock called on freeing lockres %s! flags "
972 "0x%lx\n", lockres->l_name, lockres->l_flags);
973
974 /* We only compare against the currently granted level
975 * here. If the lock is blocked waiting on a downconvert,
976 * we'll get caught below. */
977 if (lockres->l_flags & OCFS2_LOCK_BUSY &&
978 level > lockres->l_level) {
979 /* is someone sitting in dlm_lock? If so, wait on
980 * them. */
981 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
982 wait = 1;
983 goto unlock;
984 }
985
986 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
987 /* lock has not been created yet. */
988 spin_unlock_irqrestore(&lockres->l_lock, flags);
989
990 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
991 if (ret < 0) {
992 mlog_errno(ret);
993 goto out;
994 }
995 goto again;
996 }
997
998 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
999 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
		/* the lock is currently blocked on behalf of
		 * another node */
1002 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
1003 wait = 1;
1004 goto unlock;
1005 }
1006
1007 if (level > lockres->l_level) {
1008 if (lockres->l_action != OCFS2_AST_INVALID)
1009 mlog(ML_ERROR, "lockres %s has action %u pending\n",
1010 lockres->l_name, lockres->l_action);
1011
1012 lockres->l_action = OCFS2_AST_CONVERT;
1013 lockres->l_requested = level;
1014 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1015 spin_unlock_irqrestore(&lockres->l_lock, flags);
1016
1017 BUG_ON(level == LKM_IVMODE);
1018 BUG_ON(level == LKM_NLMODE);
1019
1020 mlog(0, "lock %s, convert from %d to level = %d\n",
1021 lockres->l_name, lockres->l_level, level);
1022
1023 /* call dlm_lock to upgrade lock now */
1024 status = dlmlock(osb->dlm,
1025 level,
1026 &lockres->l_lksb,
Mark Fashehb80fc012006-09-12 22:08:14 -07001027 lkm_flags|LKM_CONVERT,
Mark Fashehccd979b2005-12-15 14:31:24 -08001028 lockres->l_name,
Mark Fashehf0681062006-09-08 11:40:10 -07001029 OCFS2_LOCK_ID_MAX_LEN - 1,
Mark Fashehe92d57d2006-09-12 21:34:35 -07001030 ocfs2_locking_ast,
Mark Fashehccd979b2005-12-15 14:31:24 -08001031 lockres,
Mark Fashehaa2623a2006-09-12 21:58:23 -07001032 ocfs2_blocking_ast);
Mark Fashehccd979b2005-12-15 14:31:24 -08001033 if (status != DLM_NORMAL) {
1034 if ((lkm_flags & LKM_NOQUEUE) &&
1035 (status == DLM_NOTQUEUED))
1036 ret = -EAGAIN;
1037 else {
1038 ocfs2_log_dlm_error("dlmlock", status,
1039 lockres);
1040 ret = -EINVAL;
1041 }
1042 ocfs2_recover_from_dlm_error(lockres, 1);
1043 goto out;
1044 }
1045
		mlog(0, "lock %s, successful return from dlmlock\n",
		     lockres->l_name);
1048
1049 /* At this point we've gone inside the dlm and need to
1050 * complete our work regardless. */
1051 catch_signals = 0;
1052
1053 /* wait for busy to clear and carry on */
1054 goto again;
1055 }
1056
1057 /* Ok, if we get here then we're good to go. */
1058 ocfs2_inc_holders(lockres, level);
1059
1060 ret = 0;
1061unlock:
1062 spin_unlock_irqrestore(&lockres->l_lock, flags);
1063out:
	/*
	 * This helps work around a lock inversion between the page lock
	 * and dlm locks. One path holds the page lock while calling aops
	 * which block on acquiring dlm locks. The vote thread holds dlm
	 * locks while acquiring page locks as it downconverts data locks.
	 * This block helps an aop path notice the inversion and back
	 * off to unlock its page lock before trying the dlm lock again.
	 */
1072 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
1073 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
1074 wait = 0;
1075 if (lockres_remove_mask_waiter(lockres, &mw))
1076 ret = -EAGAIN;
1077 else
1078 goto again;
1079 }
1080 if (wait) {
1081 ret = ocfs2_wait_for_mask(&mw);
1082 if (ret == 0)
1083 goto again;
1084 mlog_errno(ret);
1085 }
1086
1087 mlog_exit(ret);
1088 return ret;
1089}
1090
1091static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1092 struct ocfs2_lock_res *lockres,
1093 int level)
1094{
1095 unsigned long flags;
1096
1097 mlog_entry_void();
1098 spin_lock_irqsave(&lockres->l_lock, flags);
1099 ocfs2_dec_holders(lockres, level);
1100 ocfs2_vote_on_unlock(osb, lockres);
1101 spin_unlock_irqrestore(&lockres->l_lock, flags);
1102 mlog_exit_void();
1103}
1104
Adrian Bunkda661162006-11-20 03:24:28 +01001105static int ocfs2_create_new_lock(struct ocfs2_super *osb,
1106 struct ocfs2_lock_res *lockres,
1107 int ex,
1108 int local)
Mark Fashehccd979b2005-12-15 14:31:24 -08001109{
Mark Fashehd680efe2006-09-08 14:14:34 -07001110 int level = ex ? LKM_EXMODE : LKM_PRMODE;
Mark Fashehccd979b2005-12-15 14:31:24 -08001111 unsigned long flags;
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001112 int lkm_flags = local ? LKM_LOCAL : 0;
Mark Fashehccd979b2005-12-15 14:31:24 -08001113
1114 spin_lock_irqsave(&lockres->l_lock, flags);
1115 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1116 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1117 spin_unlock_irqrestore(&lockres->l_lock, flags);
1118
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001119 return ocfs2_lock_create(osb, lockres, level, lkm_flags);
Mark Fashehccd979b2005-12-15 14:31:24 -08001120}
1121
1122/* Grants us an EX lock on the data and metadata resources, skipping
1123 * the normal cluster directory lookup. Use this ONLY on newly created
1124 * inodes which other nodes can't possibly see, and which haven't been
1125 * hashed in the inode hash yet. This can give us a good performance
1126 * increase as it'll skip the network broadcast normally associated
1127 * with creating a new lock resource. */
1128int ocfs2_create_new_inode_locks(struct inode *inode)
1129{
1130 int ret;
Mark Fashehd680efe2006-09-08 14:14:34 -07001131 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fashehccd979b2005-12-15 14:31:24 -08001132
1133 BUG_ON(!inode);
1134 BUG_ON(!ocfs2_inode_is_new(inode));
1135
1136 mlog_entry_void();
1137
Mark Fashehb0697052006-03-03 10:24:33 -08001138 mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001139
1140 /* NOTE: That we don't increment any of the holder counts, nor
1141 * do we add anything to a journal handle. Since this is
1142 * supposed to be a new inode which the cluster doesn't know
1143 * about yet, there is no need to. As far as the LVB handling
1144 * is concerned, this is basically like acquiring an EX lock
1145 * on a resource which has an invalid one -- we'll set it
1146 * valid when we release the EX. */
1147
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001148 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
Mark Fashehccd979b2005-12-15 14:31:24 -08001149 if (ret) {
1150 mlog_errno(ret);
1151 goto bail;
1152 }
1153
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001154 /*
1155 * We don't want to use LKM_LOCAL on a meta data lock as they
1156 * don't use a generation in their lock names.
1157 */
1158 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
Mark Fashehccd979b2005-12-15 14:31:24 -08001159 if (ret) {
1160 mlog_errno(ret);
1161 goto bail;
1162 }
1163
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001164 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
Mark Fashehccd979b2005-12-15 14:31:24 -08001165 if (ret) {
1166 mlog_errno(ret);
1167 goto bail;
1168 }
1169
Tiger Yang50008632007-03-20 16:01:38 -07001170 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
1171 if (ret) {
1172 mlog_errno(ret);
1173 goto bail;
1174 }
1175
Mark Fashehccd979b2005-12-15 14:31:24 -08001176bail:
1177 mlog_exit(ret);
1178 return ret;
1179}
1180
1181int ocfs2_rw_lock(struct inode *inode, int write)
1182{
1183 int status, level;
1184 struct ocfs2_lock_res *lockres;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fashehccd979b2005-12-15 14:31:24 -08001186
1187 BUG_ON(!inode);
1188
1189 mlog_entry_void();
1190
Mark Fashehb0697052006-03-03 10:24:33 -08001191 mlog(0, "inode %llu take %s RW lock\n",
1192 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001193 write ? "EXMODE" : "PRMODE");
1194
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001195 if (ocfs2_mount_local(osb))
1196 return 0;
1197
Mark Fashehccd979b2005-12-15 14:31:24 -08001198 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1199
1200 level = write ? LKM_EXMODE : LKM_PRMODE;
1201
1202 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1203 0);
1204 if (status < 0)
1205 mlog_errno(status);
1206
1207 mlog_exit(status);
1208 return status;
1209}
1210
1211void ocfs2_rw_unlock(struct inode *inode, int write)
1212{
1213 int level = write ? LKM_EXMODE : LKM_PRMODE;
1214 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fashehccd979b2005-12-15 14:31:24 -08001216
1217 mlog_entry_void();
1218
Mark Fashehb0697052006-03-03 10:24:33 -08001219 mlog(0, "inode %llu drop %s RW lock\n",
1220 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001221 write ? "EXMODE" : "PRMODE");
1222
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001223 if (!ocfs2_mount_local(osb))
1224 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
Mark Fashehccd979b2005-12-15 14:31:24 -08001225
1226 mlog_exit_void();
1227}
1228
Tiger Yang50008632007-03-20 16:01:38 -07001229/*
1230 * ocfs2_open_lock always get PR mode lock.
1231 */
1232int ocfs2_open_lock(struct inode *inode)
1233{
1234 int status = 0;
1235 struct ocfs2_lock_res *lockres;
1236 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1237
1238 BUG_ON(!inode);
1239
1240 mlog_entry_void();
1241
1242 mlog(0, "inode %llu take PRMODE open lock\n",
1243 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1244
1245 if (ocfs2_mount_local(osb))
1246 goto out;
1247
1248 lockres = &OCFS2_I(inode)->ip_open_lockres;
1249
1250 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1251 LKM_PRMODE, 0, 0);
1252 if (status < 0)
1253 mlog_errno(status);
1254
1255out:
1256 mlog_exit(status);
1257 return status;
1258}
1259
1260int ocfs2_try_open_lock(struct inode *inode, int write)
1261{
1262 int status = 0, level;
1263 struct ocfs2_lock_res *lockres;
1264 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1265
1266 BUG_ON(!inode);
1267
1268 mlog_entry_void();
1269
1270 mlog(0, "inode %llu try to take %s open lock\n",
1271 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1272 write ? "EXMODE" : "PRMODE");
1273
1274 if (ocfs2_mount_local(osb))
1275 goto out;
1276
1277 lockres = &OCFS2_I(inode)->ip_open_lockres;
1278
1279 level = write ? LKM_EXMODE : LKM_PRMODE;
1280
	/*
	 * The file system may already be holding a PRMODE/EXMODE open lock.
	 * Since we pass LKM_NOQUEUE, the request won't block waiting on
	 * other nodes and the -EAGAIN will indicate to the caller that
	 * this inode is still in use.
	 */
1287 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1288 level, LKM_NOQUEUE, 0);
1289
1290out:
1291 mlog_exit(status);
1292 return status;
1293}
1294
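/*
 * Informal note on how the open lock appears to be used: nodes hold a
 * PRMODE open lock while an inode is in use (ocfs2_open_lock() above),
 * so a node that wants to know whether an inode is still open elsewhere
 * asks for a conflicting level with LKM_NOQUEUE via
 * ocfs2_try_open_lock(); an -EAGAIN return then means another node
 * still has it open.
 */
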
1295/*
1296 * ocfs2_open_unlock unlock PR and EX mode open locks.
1297 */
1298void ocfs2_open_unlock(struct inode *inode)
1299{
1300 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
1301 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1302
1303 mlog_entry_void();
1304
1305 mlog(0, "inode %llu drop open lock\n",
1306 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1307
1308 if (ocfs2_mount_local(osb))
1309 goto out;
1310
1311 if(lockres->l_ro_holders)
1312 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1313 LKM_PRMODE);
1314 if(lockres->l_ex_holders)
1315 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1316 LKM_EXMODE);
1317
1318out:
1319 mlog_exit_void();
1320}
1321
Mark Fashehccd979b2005-12-15 14:31:24 -08001322int ocfs2_data_lock_full(struct inode *inode,
1323 int write,
1324 int arg_flags)
1325{
1326 int status = 0, level;
1327 struct ocfs2_lock_res *lockres;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001328 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fashehccd979b2005-12-15 14:31:24 -08001329
1330 BUG_ON(!inode);
1331
1332 mlog_entry_void();
1333
Mark Fashehb0697052006-03-03 10:24:33 -08001334 mlog(0, "inode %llu take %s DATA lock\n",
1335 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001336 write ? "EXMODE" : "PRMODE");
1337
1338 /* We'll allow faking a readonly data lock for
1339 * rodevices. */
1340 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1341 if (write) {
1342 status = -EROFS;
1343 mlog_errno(status);
1344 }
1345 goto out;
1346 }
1347
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001348 if (ocfs2_mount_local(osb))
1349 goto out;
1350
Mark Fashehccd979b2005-12-15 14:31:24 -08001351 lockres = &OCFS2_I(inode)->ip_data_lockres;
1352
1353 level = write ? LKM_EXMODE : LKM_PRMODE;
1354
1355 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1356 0, arg_flags);
1357 if (status < 0 && status != -EAGAIN)
1358 mlog_errno(status);
1359
1360out:
1361 mlog_exit(status);
1362 return status;
1363}
1364
1365/* see ocfs2_meta_lock_with_page() */
1366int ocfs2_data_lock_with_page(struct inode *inode,
1367 int write,
1368 struct page *page)
1369{
1370 int ret;
1371
1372 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1373 if (ret == -EAGAIN) {
1374 unlock_page(page);
1375 if (ocfs2_data_lock(inode, write) == 0)
1376 ocfs2_data_unlock(inode, write);
1377 ret = AOP_TRUNCATED_PAGE;
1378 }
1379
1380 return ret;
1381}
1382
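/*
 * Illustrative summary of the -EAGAIN dance above: when the nonblocking
 * attempt fails because the lock is busy or blocked, the page lock is
 * dropped first (avoiding the inversion described in
 * ocfs2_cluster_lock()), a blocking lock/unlock pair is taken purely to
 * wait out the downconvert, and AOP_TRUNCATED_PAGE tells the caller to
 * look the page up again and retry.
 */
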
1383static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1384 struct ocfs2_lock_res *lockres)
1385{
1386 int kick = 0;
1387
1388 mlog_entry_void();
1389
	/* If we know that another node is waiting on our lock, kick
	 * the vote thread pre-emptively when we reach a release
	 * condition. */
1393 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1394 switch(lockres->l_blocking) {
1395 case LKM_EXMODE:
1396 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1397 kick = 1;
1398 break;
1399 case LKM_PRMODE:
1400 if (!lockres->l_ex_holders)
1401 kick = 1;
1402 break;
1403 default:
1404 BUG();
1405 }
1406 }
1407
1408 if (kick)
1409 ocfs2_kick_vote_thread(osb);
1410
1411 mlog_exit_void();
1412}
1413
1414void ocfs2_data_unlock(struct inode *inode,
1415 int write)
1416{
1417 int level = write ? LKM_EXMODE : LKM_PRMODE;
1418 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001419 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fashehccd979b2005-12-15 14:31:24 -08001420
1421 mlog_entry_void();
1422
Mark Fashehb0697052006-03-03 10:24:33 -08001423 mlog(0, "inode %llu drop %s DATA lock\n",
1424 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001425 write ? "EXMODE" : "PRMODE");
1426
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001427 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
1428 !ocfs2_mount_local(osb))
Mark Fashehccd979b2005-12-15 14:31:24 -08001429 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1430
1431 mlog_exit_void();
1432}
1433
1434#define OCFS2_SEC_BITS 34
1435#define OCFS2_SEC_SHIFT (64 - 34)
1436#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
1437
1438/* LVB only has room for 64 bits of time here so we pack it for
1439 * now. */
1440static u64 ocfs2_pack_timespec(struct timespec *spec)
1441{
1442 u64 res;
1443 u64 sec = spec->tv_sec;
1444 u32 nsec = spec->tv_nsec;
1445
1446 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1447
1448 return res;
1449}
1450
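/*
 * Worked example of the packing above (illustration only): with
 * OCFS2_SEC_SHIFT == 30, a timespec of { .tv_sec = 1, .tv_nsec = 5 }
 * packs to (1ULL << 30) | 5 == 0x40000005.  ocfs2_unpack_timespec()
 * below reverses this by shifting down for the seconds and masking
 * with OCFS2_NSEC_MASK for the nanoseconds.
 */
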
1451/* Call this with the lockres locked. I am reasonably sure we don't
1452 * need ip_lock in this function as anyone who would be changing those
1453 * values is supposed to be blocked in ocfs2_meta_lock right now. */
1454static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1455{
1456 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1457 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1458 struct ocfs2_meta_lvb *lvb;
1459
1460 mlog_entry_void();
1461
1462 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1463
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001464 /*
1465 * Invalidate the LVB of a deleted inode - this way other
1466 * nodes are forced to go to disk and discover the new inode
1467 * status.
1468 */
1469 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1470 lvb->lvb_version = 0;
1471 goto out;
1472 }
1473
Mark Fasheh4d3b83f2006-09-12 15:22:18 -07001474 lvb->lvb_version = OCFS2_LVB_VERSION;
Mark Fashehccd979b2005-12-15 14:31:24 -08001475 lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
1476 lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
1477 lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
1478 lvb->lvb_igid = cpu_to_be32(inode->i_gid);
1479 lvb->lvb_imode = cpu_to_be16(inode->i_mode);
1480 lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
1481 lvb->lvb_iatime_packed =
1482 cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
1483 lvb->lvb_ictime_packed =
1484 cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
1485 lvb->lvb_imtime_packed =
1486 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
Herbert Poetzlca4d1472006-07-03 17:27:12 -07001487 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
Mark Fashehf9e2d822006-09-12 15:35:49 -07001488 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
Mark Fashehccd979b2005-12-15 14:31:24 -08001489
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001490out:
Mark Fashehccd979b2005-12-15 14:31:24 -08001491 mlog_meta_lvb(0, lockres);
1492
1493 mlog_exit_void();
1494}
1495
1496static void ocfs2_unpack_timespec(struct timespec *spec,
1497 u64 packed_time)
1498{
1499 spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
1500 spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
1501}
1502
1503static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1504{
1505 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1506 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1507 struct ocfs2_meta_lvb *lvb;
1508
1509 mlog_entry_void();
1510
1511 mlog_meta_lvb(0, lockres);
1512
1513 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1514
1515 /* We're safe here without the lockres lock... */
1516 spin_lock(&oi->ip_lock);
1517 oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
1518 i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1519
Herbert Poetzlca4d1472006-07-03 17:27:12 -07001520 oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
1521 ocfs2_set_inode_flags(inode);
1522
Mark Fashehccd979b2005-12-15 14:31:24 -08001523 /* fast-symlinks are a special case */
1524 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1525 inode->i_blocks = 0;
1526 else
Mark Fasheh8110b072007-03-22 16:53:23 -07001527 inode->i_blocks = ocfs2_inode_sector_count(inode);
Mark Fashehccd979b2005-12-15 14:31:24 -08001528
1529 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1530 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
1531 inode->i_mode = be16_to_cpu(lvb->lvb_imode);
1532 inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
1533 ocfs2_unpack_timespec(&inode->i_atime,
1534 be64_to_cpu(lvb->lvb_iatime_packed));
1535 ocfs2_unpack_timespec(&inode->i_mtime,
1536 be64_to_cpu(lvb->lvb_imtime_packed));
1537 ocfs2_unpack_timespec(&inode->i_ctime,
1538 be64_to_cpu(lvb->lvb_ictime_packed));
1539 spin_unlock(&oi->ip_lock);
1540
1541 mlog_exit_void();
1542}
1543
Mark Fashehf9e2d822006-09-12 15:35:49 -07001544static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1545 struct ocfs2_lock_res *lockres)
Mark Fashehccd979b2005-12-15 14:31:24 -08001546{
1547 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1548
Mark Fashehf9e2d822006-09-12 15:35:49 -07001549 if (lvb->lvb_version == OCFS2_LVB_VERSION
1550 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
Mark Fashehccd979b2005-12-15 14:31:24 -08001551 return 1;
1552 return 0;
1553}
1554
1555/* Determine whether a lock resource needs to be refreshed, and
1556 * arbitrate who gets to refresh it.
1557 *
1558 * 0 means no refresh needed.
1559 *
1560 * > 0 means you need to refresh this and you MUST call
1561 * ocfs2_complete_lock_res_refresh afterwards. */
1562static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
1563{
1564 unsigned long flags;
1565 int status = 0;
1566
1567 mlog_entry_void();
1568
1569refresh_check:
1570 spin_lock_irqsave(&lockres->l_lock, flags);
1571 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
1572 spin_unlock_irqrestore(&lockres->l_lock, flags);
1573 goto bail;
1574 }
1575
1576 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
1577 spin_unlock_irqrestore(&lockres->l_lock, flags);
1578
1579 ocfs2_wait_on_refreshing_lock(lockres);
1580 goto refresh_check;
1581 }
1582
1583 /* Ok, I'll be the one to refresh this lock. */
1584 lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
1585 spin_unlock_irqrestore(&lockres->l_lock, flags);
1586
1587 status = 1;
1588bail:
1589 mlog_exit(status);
1590 return status;
1591}
1592
/* If status is nonzero, I'll mark it as not being in refresh
 * anymore, but I won't clear the needs refresh flag. */
1595static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
1596 int status)
1597{
1598 unsigned long flags;
1599 mlog_entry_void();
1600
1601 spin_lock_irqsave(&lockres->l_lock, flags);
1602 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
1603 if (!status)
1604 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
1605 spin_unlock_irqrestore(&lockres->l_lock, flags);
1606
1607 wake_up(&lockres->l_event);
1608
1609 mlog_exit_void();
1610}
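/* Example (sketch): a caller arbitrating a refresh with the two helpers
 * above follows this pattern, where read_and_validate_state() is only a
 * placeholder for whatever work actually revalidates the protected data:
 *
 *	status = ocfs2_should_refresh_lock_res(lockres);
 *	if (status) {
 *		status = read_and_validate_state();
 *		ocfs2_complete_lock_res_refresh(lockres, status);
 *	}
 *
 * A non-zero return from ocfs2_should_refresh_lock_res() means we won the
 * arbitration, so ocfs2_complete_lock_res_refresh() must be called even on
 * error, otherwise other waiters stay stuck on OCFS2_LOCK_REFRESHING. */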
1611
1612/* may or may not return a bh if it went to disk. */
1613static int ocfs2_meta_lock_update(struct inode *inode,
1614 struct buffer_head **bh)
1615{
1616 int status = 0;
1617 struct ocfs2_inode_info *oi = OCFS2_I(inode);
Mark Fashehbe9e9862007-04-18 15:22:08 -07001618 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
Mark Fashehccd979b2005-12-15 14:31:24 -08001619 struct ocfs2_dinode *fe;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001620 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fashehccd979b2005-12-15 14:31:24 -08001621
1622 mlog_entry_void();
1623
Mark Fashehbe9e9862007-04-18 15:22:08 -07001624 if (ocfs2_mount_local(osb))
1625 goto bail;
1626
Mark Fashehccd979b2005-12-15 14:31:24 -08001627 spin_lock(&oi->ip_lock);
1628 if (oi->ip_flags & OCFS2_INODE_DELETED) {
Mark Fashehb0697052006-03-03 10:24:33 -08001629 mlog(0, "Orphaned inode %llu was deleted while we "
Mark Fashehccd979b2005-12-15 14:31:24 -08001630 "were waiting on a lock. ip_flags = 0x%x\n",
Mark Fashehb0697052006-03-03 10:24:33 -08001631 (unsigned long long)oi->ip_blkno, oi->ip_flags);
Mark Fashehccd979b2005-12-15 14:31:24 -08001632 spin_unlock(&oi->ip_lock);
1633 status = -ENOENT;
1634 goto bail;
1635 }
1636 spin_unlock(&oi->ip_lock);
1637
Mark Fashehbe9e9862007-04-18 15:22:08 -07001638 if (!ocfs2_should_refresh_lock_res(lockres))
1639 goto bail;
Mark Fashehccd979b2005-12-15 14:31:24 -08001640
1641 /* This will discard any caching information we might have had
1642 * for the inode metadata. */
1643 ocfs2_metadata_cache_purge(inode);
1644
Mark Fasheh83418972007-04-23 18:53:12 -07001645 ocfs2_extent_map_trunc(inode, 0);
1646
Mark Fashehbe9e9862007-04-18 15:22:08 -07001647 if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
Mark Fashehb0697052006-03-03 10:24:33 -08001648 mlog(0, "Trusting LVB on inode %llu\n",
1649 (unsigned long long)oi->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08001650 ocfs2_refresh_inode_from_lvb(inode);
1651 } else {
1652 /* Boo, we have to go to disk. */
1653 /* read bh, cast, ocfs2_refresh_inode */
1654 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
1655 bh, OCFS2_BH_CACHED, inode);
1656 if (status < 0) {
1657 mlog_errno(status);
1658 goto bail_refresh;
1659 }
1660 fe = (struct ocfs2_dinode *) (*bh)->b_data;
1661
1662 /* This is a good chance to make sure we're not
1663 * locking an invalid object.
1664 *
1665 * We bug on a stale inode here because we checked
1666 * above whether it was wiped from disk. The wiping
1667 * node provides a guarantee that we receive that
1668 * message and can mark the inode before dropping any
1669 * locks associated with it. */
1670 if (!OCFS2_IS_VALID_DINODE(fe)) {
1671 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1672 status = -EIO;
1673 goto bail_refresh;
1674 }
1675 mlog_bug_on_msg(inode->i_generation !=
1676 le32_to_cpu(fe->i_generation),
Mark Fashehb0697052006-03-03 10:24:33 -08001677 "Invalid dinode %llu disk generation: %u "
Mark Fashehccd979b2005-12-15 14:31:24 -08001678 "inode->i_generation: %u\n",
Mark Fashehb0697052006-03-03 10:24:33 -08001679 (unsigned long long)oi->ip_blkno,
1680 le32_to_cpu(fe->i_generation),
Mark Fashehccd979b2005-12-15 14:31:24 -08001681 inode->i_generation);
1682 mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
1683 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
Mark Fashehb0697052006-03-03 10:24:33 -08001684 "Stale dinode %llu dtime: %llu flags: 0x%x\n",
1685 (unsigned long long)oi->ip_blkno,
1686 (unsigned long long)le64_to_cpu(fe->i_dtime),
Mark Fashehccd979b2005-12-15 14:31:24 -08001687 le32_to_cpu(fe->i_flags));
1688
1689 ocfs2_refresh_inode(inode, fe);
1690 }
1691
1692 status = 0;
1693bail_refresh:
Mark Fashehbe9e9862007-04-18 15:22:08 -07001694 ocfs2_complete_lock_res_refresh(lockres, status);
Mark Fashehccd979b2005-12-15 14:31:24 -08001695bail:
1696 mlog_exit(status);
1697 return status;
1698}
1699
1700static int ocfs2_assign_bh(struct inode *inode,
1701 struct buffer_head **ret_bh,
1702 struct buffer_head *passed_bh)
1703{
1704 int status;
1705
1706 if (passed_bh) {
1707 /* Ok, the update went to disk for us, use the
1708 * returned bh. */
1709 *ret_bh = passed_bh;
1710 get_bh(*ret_bh);
1711
1712 return 0;
1713 }
1714
1715 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1716 OCFS2_I(inode)->ip_blkno,
1717 ret_bh,
1718 OCFS2_BH_CACHED,
1719 inode);
1720 if (status < 0)
1721 mlog_errno(status);
1722
1723 return status;
1724}
1725
1726/*
1727 * returns < 0 error if the callback will never be called, otherwise
1728 * the result of the lock will be communicated via the callback.
1729 */
1730int ocfs2_meta_lock_full(struct inode *inode,
Mark Fashehccd979b2005-12-15 14:31:24 -08001731 struct buffer_head **ret_bh,
1732 int ex,
1733 int arg_flags)
1734{
1735 int status, level, dlm_flags, acquired;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001736 struct ocfs2_lock_res *lockres = NULL;
Mark Fashehccd979b2005-12-15 14:31:24 -08001737 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1738 struct buffer_head *local_bh = NULL;
1739
1740 BUG_ON(!inode);
1741
1742 mlog_entry_void();
1743
Mark Fashehb0697052006-03-03 10:24:33 -08001744 mlog(0, "inode %llu, take %s META lock\n",
1745 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001746 ex ? "EXMODE" : "PRMODE");
1747
1748 status = 0;
1749 acquired = 0;
1750 /* We'll allow faking a readonly metadata lock for
1751 * read-only devices. */
1752 if (ocfs2_is_hard_readonly(osb)) {
1753 if (ex)
1754 status = -EROFS;
1755 goto bail;
1756 }
1757
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001758 if (ocfs2_mount_local(osb))
1759 goto local;
1760
Mark Fashehccd979b2005-12-15 14:31:24 -08001761 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1762 wait_event(osb->recovery_event,
1763 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1764
Mark Fashehccd979b2005-12-15 14:31:24 -08001765 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1766 level = ex ? LKM_EXMODE : LKM_PRMODE;
1767 dlm_flags = 0;
1768 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1769 dlm_flags |= LKM_NOQUEUE;
1770
1771 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1772 if (status < 0) {
1773 if (status != -EAGAIN && status != -EIOCBRETRY)
1774 mlog_errno(status);
1775 goto bail;
1776 }
1777
1778 /* Notify the error cleanup path to drop the cluster lock. */
1779 acquired = 1;
1780
1781 /* We wait twice because a node may have died while we were in
1782 * the lower dlm layers. The second time though, we've
1783 * committed to owning this lock so we don't allow signals to
1784 * abort the operation. */
1785 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1786 wait_event(osb->recovery_event,
1787 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1788
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001789local:
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001790 /*
1791 * We only see this flag if we're being called from
1792 * ocfs2_read_locked_inode(). It means we're locking an inode
1793 * which hasn't been populated yet, so clear the refresh flag
1794 * and let the caller handle it.
1795 */
1796 if (inode->i_state & I_NEW) {
1797 status = 0;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001798 if (lockres)
1799 ocfs2_complete_lock_res_refresh(lockres, 0);
Mark Fasheh24c19ef2006-09-22 17:28:19 -07001800 goto bail;
1801 }
1802
Mark Fashehccd979b2005-12-15 14:31:24 -08001803 /* This is fun. The caller may want a bh back, or it may
1804 * not. ocfs2_meta_lock_update definitely wants one in, but
1805 * may or may not read one, depending on what's in the
1806 * LVB. The result of all of this is that we've *only* gone to
1807 * disk if we have to, so the complexity is worthwhile. */
1808 status = ocfs2_meta_lock_update(inode, &local_bh);
1809 if (status < 0) {
1810 if (status != -ENOENT)
1811 mlog_errno(status);
1812 goto bail;
1813 }
1814
1815 if (ret_bh) {
1816 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1817 if (status < 0) {
1818 mlog_errno(status);
1819 goto bail;
1820 }
1821 }
1822
Mark Fashehccd979b2005-12-15 14:31:24 -08001823bail:
1824 if (status < 0) {
1825 if (ret_bh && (*ret_bh)) {
1826 brelse(*ret_bh);
1827 *ret_bh = NULL;
1828 }
1829 if (acquired)
1830 ocfs2_meta_unlock(inode, ex);
1831 }
1832
1833 if (local_bh)
1834 brelse(local_bh);
1835
1836 mlog_exit(status);
1837 return status;
1838}
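/* Example (sketch): the common take/use/release pairing for the meta lock.
 * The buffer_head handling below is illustrative only; callers which don't
 * need the on-disk inode simply pass a NULL ret_bh:
 *
 *	struct buffer_head *bh = NULL;
 *
 *	status = ocfs2_meta_lock(inode, &bh, 1);
 *	if (status < 0) {
 *		mlog_errno(status);
 *		return status;
 *	}
 *	... modify the inode under the EX lock, using bh if needed ...
 *	brelse(bh);
 *	ocfs2_meta_unlock(inode, 1);
 */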
1839
1840/*
1841 * This is working around a lock inversion between tasks acquiring DLM locks
1842 * while holding a page lock and the vote thread which blocks dlm lock acquisition
1843 * while acquiring page locks.
1844 *
1845 * ** These _with_page variants are only intended to be called from aop
1846 * methods that hold page locks and return a very specific *positive* error
1847 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1848 *
1849 * The DLM is called such that it returns -EAGAIN if it would have blocked
1850 * waiting for the vote thread. In that case we unlock our page so the vote
1851 * thread can make progress. Once we've done this we have to return
1852 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
1853 * into the VFS, which will then immediately retry the aop call.
1854 *
1855 * We do a blocking lock and immediate unlock before returning, though, so that
1856 * the lock has a great chance of being cached on this node by the time the VFS
1857 * calls back to retry the aop. This has a potential to livelock as nodes
1858 * ping locks back and forth, but that's a risk we're willing to take in
1859 * order to keep the fix for the lock inversion simple.
1860 */
1861int ocfs2_meta_lock_with_page(struct inode *inode,
Mark Fashehccd979b2005-12-15 14:31:24 -08001862 struct buffer_head **ret_bh,
1863 int ex,
1864 struct page *page)
1865{
1866 int ret;
1867
Mark Fasheh4bcec182006-10-09 16:02:40 -07001868 ret = ocfs2_meta_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
Mark Fashehccd979b2005-12-15 14:31:24 -08001869 if (ret == -EAGAIN) {
1870 unlock_page(page);
Mark Fasheh4bcec182006-10-09 16:02:40 -07001871 if (ocfs2_meta_lock(inode, ret_bh, ex) == 0)
Mark Fashehccd979b2005-12-15 14:31:24 -08001872 ocfs2_meta_unlock(inode, ex);
1873 ret = AOP_TRUNCATED_PAGE;
1874 }
1875
1876 return ret;
1877}
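/* Example (sketch): how an aop method is expected to consume the positive
 * AOP_TRUNCATED_PAGE return; the surrounding readpage-style body here is
 * hypothetical:
 *
 *	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
 *	if (ret != 0) {
 *		if (ret == AOP_TRUNCATED_PAGE)
 *			return ret;
 *		mlog_errno(ret);
 *		unlock_page(page);
 *		return ret;
 *	}
 *	... the page is still locked here; do the read ...
 *	ocfs2_meta_unlock(inode, 0);
 *
 * In the AOP_TRUNCATED_PAGE case the page has already been unlocked for us
 * above, so the caller must not unlock it again before returning. */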
1878
Tiger Yang7f1a37e2006-11-15 15:48:42 +08001879int ocfs2_meta_lock_atime(struct inode *inode,
1880 struct vfsmount *vfsmnt,
1881 int *level)
1882{
1883 int ret;
1884
1885 mlog_entry_void();
1886 ret = ocfs2_meta_lock(inode, NULL, 0);
1887 if (ret < 0) {
1888 mlog_errno(ret);
1889 return ret;
1890 }
1891
1892 /*
1893 * If we should update atime, we will get EX lock,
1894 * otherwise we just get PR lock.
1895 */
1896 if (ocfs2_should_update_atime(inode, vfsmnt)) {
1897 struct buffer_head *bh = NULL;
1898
1899 ocfs2_meta_unlock(inode, 0);
1900 ret = ocfs2_meta_lock(inode, &bh, 1);
1901 if (ret < 0) {
1902 mlog_errno(ret);
1903 return ret;
1904 }
1905 *level = 1;
1906 if (ocfs2_should_update_atime(inode, vfsmnt))
1907 ocfs2_update_inode_atime(inode, bh);
1908 if (bh)
1909 brelse(bh);
1910 } else
1911 *level = 0;
1912
1913 mlog_exit(ret);
1914 return ret;
1915}
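/* Example (sketch): the *level out-parameter reports which mode was finally
 * taken and must be echoed back to ocfs2_meta_unlock(). "vfsmnt" stands in
 * for whatever vfsmount the caller happens to have:
 *
 *	int level;
 *
 *	ret = ocfs2_meta_lock_atime(inode, vfsmnt, &level);
 *	if (ret < 0)
 *		return ret;
 *	... read from the inode ...
 *	ocfs2_meta_unlock(inode, level);
 */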
1916
Mark Fashehccd979b2005-12-15 14:31:24 -08001917void ocfs2_meta_unlock(struct inode *inode,
1918 int ex)
1919{
1920 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1921 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001922 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fashehccd979b2005-12-15 14:31:24 -08001923
1924 mlog_entry_void();
1925
Mark Fashehb0697052006-03-03 10:24:33 -08001926 mlog(0, "inode %llu drop %s META lock\n",
1927 (unsigned long long)OCFS2_I(inode)->ip_blkno,
Mark Fashehccd979b2005-12-15 14:31:24 -08001928 ex ? "EXMODE" : "PRMODE");
1929
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001930 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
1931 !ocfs2_mount_local(osb))
Mark Fashehccd979b2005-12-15 14:31:24 -08001932 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1933
1934 mlog_exit_void();
1935}
1936
1937int ocfs2_super_lock(struct ocfs2_super *osb,
1938 int ex)
1939{
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001940 int status = 0;
Mark Fashehccd979b2005-12-15 14:31:24 -08001941 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1942 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1943 struct buffer_head *bh;
1944 struct ocfs2_slot_info *si = osb->slot_info;
1945
1946 mlog_entry_void();
1947
1948 if (ocfs2_is_hard_readonly(osb))
1949 return -EROFS;
1950
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001951 if (ocfs2_mount_local(osb))
1952 goto bail;
1953
Mark Fashehccd979b2005-12-15 14:31:24 -08001954 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
1955 if (status < 0) {
1956 mlog_errno(status);
1957 goto bail;
1958 }
1959
1960 /* The super block lock path is really in the best position to
1961 * know when resources covered by the lock need to be
1962 * refreshed, so we do it here. Of course, making sense of
1963 * everything is up to the caller :) */
1964 status = ocfs2_should_refresh_lock_res(lockres);
1965 if (status < 0) {
1966 mlog_errno(status);
1967 goto bail;
1968 }
1969 if (status) {
1970 bh = si->si_bh;
1971 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
1972 si->si_inode);
1973 if (status == 0)
1974 ocfs2_update_slot_info(si);
1975
1976 ocfs2_complete_lock_res_refresh(lockres, status);
1977
1978 if (status < 0)
1979 mlog_errno(status);
1980 }
1981bail:
1982 mlog_exit(status);
1983 return status;
1984}
1985
1986void ocfs2_super_unlock(struct ocfs2_super *osb,
1987 int ex)
1988{
1989 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1990 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1991
Sunil Mushranc271c5c2006-12-05 17:56:35 -08001992 if (!ocfs2_mount_local(osb))
1993 ocfs2_cluster_unlock(osb, lockres, level);
Mark Fashehccd979b2005-12-15 14:31:24 -08001994}
1995
1996int ocfs2_rename_lock(struct ocfs2_super *osb)
1997{
1998 int status;
1999 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
2000
2001 if (ocfs2_is_hard_readonly(osb))
2002 return -EROFS;
2003
Sunil Mushranc271c5c2006-12-05 17:56:35 -08002004 if (ocfs2_mount_local(osb))
2005 return 0;
2006
Mark Fashehccd979b2005-12-15 14:31:24 -08002007 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
2008 if (status < 0)
2009 mlog_errno(status);
2010
2011 return status;
2012}
2013
2014void ocfs2_rename_unlock(struct ocfs2_super *osb)
2015{
2016 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
2017
Sunil Mushranc271c5c2006-12-05 17:56:35 -08002018 if (!ocfs2_mount_local(osb))
2019 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
Mark Fashehccd979b2005-12-15 14:31:24 -08002020}
2021
Mark Fashehd680efe2006-09-08 14:14:34 -07002022int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2023{
2024 int ret;
2025 int level = ex ? LKM_EXMODE : LKM_PRMODE;
2026 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2027 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2028
2029 BUG_ON(!dl);
2030
2031 if (ocfs2_is_hard_readonly(osb))
2032 return -EROFS;
2033
Sunil Mushranc271c5c2006-12-05 17:56:35 -08002034 if (ocfs2_mount_local(osb))
2035 return 0;
2036
Mark Fashehd680efe2006-09-08 14:14:34 -07002037 ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
2038 if (ret < 0)
2039 mlog_errno(ret);
2040
2041 return ret;
2042}
2043
2044void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
2045{
2046 int level = ex ? LKM_EXMODE : LKM_PRMODE;
2047 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2048 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2049
Sunil Mushranc271c5c2006-12-05 17:56:35 -08002050 if (!ocfs2_mount_local(osb))
2051 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
Mark Fashehd680efe2006-09-08 14:14:34 -07002052}
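/* Example (sketch): dentry cluster locks assume dentry->d_fsdata has already
 * been attached by the dcache code, hence the BUG_ON above. A hypothetical
 * caller pairs them as follows:
 *
 *	ret = ocfs2_dentry_lock(dentry, 0);
 *	if (ret < 0) {
 *		mlog_errno(ret);
 *		return ret;
 *	}
 *	... use the dentry while the PR cluster lock is held ...
 *	ocfs2_dentry_unlock(dentry, 0);
 */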
2053
Mark Fashehccd979b2005-12-15 14:31:24 -08002054/* Reference counting of the dlm debug structure. We want this because
2055 * open references on the debug inodes can outlive the mount, so
2056 * we can't rely on the ocfs2_super to always exist. */
2057static void ocfs2_dlm_debug_free(struct kref *kref)
2058{
2059 struct ocfs2_dlm_debug *dlm_debug;
2060
2061 dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
2062
2063 kfree(dlm_debug);
2064}
2065
2066void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
2067{
2068 if (dlm_debug)
2069 kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
2070}
2071
2072static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
2073{
2074 kref_get(&debug->d_refcnt);
2075}
2076
2077struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
2078{
2079 struct ocfs2_dlm_debug *dlm_debug;
2080
2081 dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
2082 if (!dlm_debug) {
2083 mlog_errno(-ENOMEM);
2084 goto out;
2085 }
2086
2087 kref_init(&dlm_debug->d_refcnt);
2088 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
2089 dlm_debug->d_locking_state = NULL;
2090out:
2091 return dlm_debug;
2092}
2093
2094/* Access to this is arbitrated for us via seq_file->sem. */
2095struct ocfs2_dlm_seq_priv {
2096 struct ocfs2_dlm_debug *p_dlm_debug;
2097 struct ocfs2_lock_res p_iter_res;
2098 struct ocfs2_lock_res p_tmp_res;
2099};
2100
2101static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
2102 struct ocfs2_dlm_seq_priv *priv)
2103{
2104 struct ocfs2_lock_res *iter, *ret = NULL;
2105 struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
2106
2107 assert_spin_locked(&ocfs2_dlm_tracking_lock);
2108
2109 list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
2110 /* discover the head of the list */
2111 if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
2112 mlog(0, "End of list found, %p\n", ret);
2113 break;
2114 }
2115
2116 /* We track our "dummy" iteration lockres' by a NULL
2117 * l_ops field. */
2118 if (iter->l_ops != NULL) {
2119 ret = iter;
2120 break;
2121 }
2122 }
2123
2124 return ret;
2125}
2126
2127static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
2128{
2129 struct ocfs2_dlm_seq_priv *priv = m->private;
2130 struct ocfs2_lock_res *iter;
2131
2132 spin_lock(&ocfs2_dlm_tracking_lock);
2133 iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
2134 if (iter) {
2135 /* Since lockres' have the lifetime of their container
2136 * (which can be inodes, ocfs2_supers, etc) we want to
2137 * copy this out to a temporary lockres while still
2138 * under the spinlock. Obviously after this we can't
2139 * trust any pointers on the copy returned, but that's
2140 * ok as the information we want isn't typically held
2141 * in them. */
2142 priv->p_tmp_res = *iter;
2143 iter = &priv->p_tmp_res;
2144 }
2145 spin_unlock(&ocfs2_dlm_tracking_lock);
2146
2147 return iter;
2148}
2149
2150static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
2151{
2152}
2153
2154static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
2155{
2156 struct ocfs2_dlm_seq_priv *priv = m->private;
2157 struct ocfs2_lock_res *iter = v;
2158 struct ocfs2_lock_res *dummy = &priv->p_iter_res;
2159
2160 spin_lock(&ocfs2_dlm_tracking_lock);
2161 iter = ocfs2_dlm_next_res(iter, priv);
2162 list_del_init(&dummy->l_debug_list);
2163 if (iter) {
2164 list_add(&dummy->l_debug_list, &iter->l_debug_list);
2165 priv->p_tmp_res = *iter;
2166 iter = &priv->p_tmp_res;
2167 }
2168 spin_unlock(&ocfs2_dlm_tracking_lock);
2169
2170 return iter;
2171}
2172
2173/* So that debugfs.ocfs2 can determine which format is being used */
2174#define OCFS2_DLM_DEBUG_STR_VERSION 1
2175static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2176{
2177 int i;
2178 char *lvb;
2179 struct ocfs2_lock_res *lockres = v;
2180
2181 if (!lockres)
2182 return -EINVAL;
2183
Mark Fashehd680efe2006-09-08 14:14:34 -07002184 seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);
2185
2186 if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
2187 seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1,
2188 lockres->l_name,
2189 (unsigned int)ocfs2_get_dentry_lock_ino(lockres));
2190 else
2191 seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name);
2192
2193 seq_printf(m, "%d\t"
Mark Fashehccd979b2005-12-15 14:31:24 -08002194 "0x%lx\t"
2195 "0x%x\t"
2196 "0x%x\t"
2197 "%u\t"
2198 "%u\t"
2199 "%d\t"
2200 "%d\t",
Mark Fashehccd979b2005-12-15 14:31:24 -08002201 lockres->l_level,
2202 lockres->l_flags,
2203 lockres->l_action,
2204 lockres->l_unlock_action,
2205 lockres->l_ro_holders,
2206 lockres->l_ex_holders,
2207 lockres->l_requested,
2208 lockres->l_blocking);
2209
2210 /* Dump the raw LVB */
2211 lvb = lockres->l_lksb.lvb;
2212 for(i = 0; i < DLM_LVB_LEN; i++)
2213 seq_printf(m, "0x%x\t", lvb[i]);
2214
2215 /* End the line */
2216 seq_printf(m, "\n");
2217 return 0;
2218}
2219
2220static struct seq_operations ocfs2_dlm_seq_ops = {
2221 .start = ocfs2_dlm_seq_start,
2222 .stop = ocfs2_dlm_seq_stop,
2223 .next = ocfs2_dlm_seq_next,
2224 .show = ocfs2_dlm_seq_show,
2225};
2226
2227static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
2228{
2229 struct seq_file *seq = (struct seq_file *) file->private_data;
2230 struct ocfs2_dlm_seq_priv *priv = seq->private;
2231 struct ocfs2_lock_res *res = &priv->p_iter_res;
2232
2233 ocfs2_remove_lockres_tracking(res);
2234 ocfs2_put_dlm_debug(priv->p_dlm_debug);
2235 return seq_release_private(inode, file);
2236}
2237
2238static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
2239{
2240 int ret;
2241 struct ocfs2_dlm_seq_priv *priv;
2242 struct seq_file *seq;
2243 struct ocfs2_super *osb;
2244
2245 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
2246 if (!priv) {
2247 ret = -ENOMEM;
2248 mlog_errno(ret);
2249 goto out;
2250 }
Theodore Ts'o8e18e292006-09-27 01:50:46 -07002251 osb = inode->i_private;
Mark Fashehccd979b2005-12-15 14:31:24 -08002252 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2253 priv->p_dlm_debug = osb->osb_dlm_debug;
2254 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2255
2256 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2257 if (ret) {
2258 kfree(priv);
2259 mlog_errno(ret);
2260 goto out;
2261 }
2262
2263 seq = (struct seq_file *) file->private_data;
2264 seq->private = priv;
2265
2266 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2267 priv->p_dlm_debug);
2268
2269out:
2270 return ret;
2271}
2272
Arjan van de Ven4b6f5d22006-03-28 01:56:42 -08002273static const struct file_operations ocfs2_dlm_debug_fops = {
Mark Fashehccd979b2005-12-15 14:31:24 -08002274 .open = ocfs2_dlm_debug_open,
2275 .release = ocfs2_dlm_debug_release,
2276 .read = seq_read,
2277 .llseek = seq_lseek,
2278};
2279
2280static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2281{
2282 int ret = 0;
2283 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2284
2285 dlm_debug->d_locking_state = debugfs_create_file("locking_state",
2286 S_IFREG|S_IRUSR,
2287 osb->osb_debug_root,
2288 osb,
2289 &ocfs2_dlm_debug_fops);
2290 if (!dlm_debug->d_locking_state) {
2291 ret = -EINVAL;
2292 mlog(ML_ERROR,
2293 "Unable to create locking state debugfs file.\n");
2294 goto out;
2295 }
2296
2297 ocfs2_get_dlm_debug(dlm_debug);
2298out:
2299 return ret;
2300}
2301
2302static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2303{
2304 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2305
2306 if (dlm_debug) {
2307 debugfs_remove(dlm_debug->d_locking_state);
2308 ocfs2_put_dlm_debug(dlm_debug);
2309 }
2310}
2311
2312int ocfs2_dlm_init(struct ocfs2_super *osb)
2313{
Sunil Mushranc271c5c2006-12-05 17:56:35 -08002314 int status = 0;
Mark Fashehccd979b2005-12-15 14:31:24 -08002315 u32 dlm_key;
Sunil Mushranc271c5c2006-12-05 17:56:35 -08002316 struct dlm_ctxt *dlm = NULL;
Mark Fashehccd979b2005-12-15 14:31:24 -08002317
2318 mlog_entry_void();
2319
Sunil Mushranc271c5c2006-12-05 17:56:35 -08002320 if (ocfs2_mount_local(osb))
2321 goto local;
2322
Mark Fashehccd979b2005-12-15 14:31:24 -08002323 status = ocfs2_dlm_init_debug(osb);
2324 if (status < 0) {
2325 mlog_errno(status);
2326 goto bail;
2327 }
2328
2329 /* launch vote thread */
Mark Fasheh78427042006-05-04 12:03:26 -07002330 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
Mark Fashehccd979b2005-12-15 14:31:24 -08002331 if (IS_ERR(osb->vote_task)) {
2332 status = PTR_ERR(osb->vote_task);
2333 osb->vote_task = NULL;
2334 mlog_errno(status);
2335 goto bail;
2336 }
2337
2338	/* used by the dlm code to make message headers unique; each
2339 * node in this domain must agree on this. */
2340 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2341
2342 /* for now, uuid == domain */
2343 dlm = dlm_register_domain(osb->uuid_str, dlm_key);
2344 if (IS_ERR(dlm)) {
2345 status = PTR_ERR(dlm);
2346 mlog_errno(status);
2347 goto bail;
2348 }
2349
Sunil Mushranc271c5c2006-12-05 17:56:35 -08002350 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
2351
2352local:
Mark Fashehccd979b2005-12-15 14:31:24 -08002353 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2354 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2355
Mark Fashehccd979b2005-12-15 14:31:24 -08002356 osb->dlm = dlm;
2357
2358 status = 0;
2359bail:
2360 if (status < 0) {
2361 ocfs2_dlm_shutdown_debug(osb);
2362 if (osb->vote_task)
2363 kthread_stop(osb->vote_task);
2364 }
2365
2366 mlog_exit(status);
2367 return status;
2368}
2369
2370void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2371{
2372 mlog_entry_void();
2373
2374 dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2375
2376 ocfs2_drop_osb_locks(osb);
2377
2378 if (osb->vote_task) {
2379 kthread_stop(osb->vote_task);
2380 osb->vote_task = NULL;
2381 }
2382
2383 ocfs2_lock_res_free(&osb->osb_super_lockres);
2384 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2385
2386 dlm_unregister_domain(osb->dlm);
2387 osb->dlm = NULL;
2388
2389 ocfs2_dlm_shutdown_debug(osb);
2390
2391 mlog_exit_void();
2392}
2393
Mark Fasheh2a45f2d2006-09-12 21:36:58 -07002394static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
Mark Fashehccd979b2005-12-15 14:31:24 -08002395{
2396 struct ocfs2_lock_res *lockres = opaque;
2397 unsigned long flags;
2398
2399 mlog_entry_void();
2400
2401 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
2402 lockres->l_unlock_action);
2403
2404 spin_lock_irqsave(&lockres->l_lock, flags);
2405 /* We tried to cancel a convert request, but it was already
2406 * granted. All we want to do here is clear our unlock
2407 * state. The wake_up call done at the bottom is redundant
2408 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2409 * hurt anything anyway */
2410 if (status == DLM_CANCELGRANT &&
2411 lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2412 mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2413
2414 /* We don't clear the busy flag in this case as it
2415 * should have been cleared by the ast which the dlm
2416 * has called. */
2417 goto complete_unlock;
2418 }
2419
2420 if (status != DLM_NORMAL) {
2421 mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
2422 "unlock_action %d\n", status, lockres->l_name,
2423 lockres->l_unlock_action);
2424 spin_unlock_irqrestore(&lockres->l_lock, flags);
2425 return;
2426 }
2427
2428 switch(lockres->l_unlock_action) {
2429 case OCFS2_UNLOCK_CANCEL_CONVERT:
2430 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2431 lockres->l_action = OCFS2_AST_INVALID;
2432 break;
2433 case OCFS2_UNLOCK_DROP_LOCK:
2434 lockres->l_level = LKM_IVMODE;
2435 break;
2436 default:
2437 BUG();
2438 }
2439
2440 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2441complete_unlock:
2442 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2443 spin_unlock_irqrestore(&lockres->l_lock, flags);
2444
2445 wake_up(&lockres->l_event);
2446
2447 mlog_exit_void();
2448}
2449
Mark Fashehccd979b2005-12-15 14:31:24 -08002450static int ocfs2_drop_lock(struct ocfs2_super *osb,
Mark Fasheh0d5dc6c2006-09-14 14:44:51 -07002451 struct ocfs2_lock_res *lockres)
Mark Fashehccd979b2005-12-15 14:31:24 -08002452{
2453 enum dlm_status status;
2454 unsigned long flags;
Mark Fashehb80fc012006-09-12 22:08:14 -07002455 int lkm_flags = 0;
Mark Fashehccd979b2005-12-15 14:31:24 -08002456
2457 /* We didn't get anywhere near actually using this lockres. */
2458 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2459 goto out;
2460
Mark Fashehb80fc012006-09-12 22:08:14 -07002461 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
2462 lkm_flags |= LKM_VALBLK;
2463
Mark Fashehccd979b2005-12-15 14:31:24 -08002464 spin_lock_irqsave(&lockres->l_lock, flags);
2465
2466 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
2467 "lockres %s, flags 0x%lx\n",
2468 lockres->l_name, lockres->l_flags);
2469
2470 while (lockres->l_flags & OCFS2_LOCK_BUSY) {
2471 mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
2472 "%u, unlock_action = %u\n",
2473 lockres->l_name, lockres->l_flags, lockres->l_action,
2474 lockres->l_unlock_action);
2475
2476 spin_unlock_irqrestore(&lockres->l_lock, flags);
2477
2478 /* XXX: Today we just wait on any busy
2479 * locks... Perhaps we need to cancel converts in the
2480 * future? */
2481 ocfs2_wait_on_busy_lock(lockres);
2482
2483 spin_lock_irqsave(&lockres->l_lock, flags);
2484 }
2485
Mark Fasheh0d5dc6c2006-09-14 14:44:51 -07002486 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
2487 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2488 lockres->l_level == LKM_EXMODE &&
2489 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2490 lockres->l_ops->set_lvb(lockres);
2491 }
Mark Fashehccd979b2005-12-15 14:31:24 -08002492
2493 if (lockres->l_flags & OCFS2_LOCK_BUSY)
2494 mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
2495 lockres->l_name);
2496 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2497 mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
2498
2499 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
2500 spin_unlock_irqrestore(&lockres->l_lock, flags);
2501 goto out;
2502 }
2503
2504 lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
2505
2506 /* make sure we never get here while waiting for an ast to
2507 * fire. */
2508 BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
2509
2510 /* is this necessary? */
2511 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2512 lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
2513 spin_unlock_irqrestore(&lockres->l_lock, flags);
2514
2515 mlog(0, "lock %s\n", lockres->l_name);
2516
Mark Fashehb80fc012006-09-12 22:08:14 -07002517 status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags,
Mark Fasheh2a45f2d2006-09-12 21:36:58 -07002518 ocfs2_unlock_ast, lockres);
Mark Fashehccd979b2005-12-15 14:31:24 -08002519 if (status != DLM_NORMAL) {
2520 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2521 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2522 dlm_print_one_lock(lockres->l_lksb.lockid);
2523 BUG();
2524 }
2525	mlog(0, "lock %s, successful return from dlmunlock\n",
2526 lockres->l_name);
2527
2528 ocfs2_wait_on_busy_lock(lockres);
2529out:
2530 mlog_exit(0);
2531 return 0;
2532}
2533
2534/* Mark the lockres as being dropped. It will no longer be
2535 * queued if blocking, but we still may have to wait on it
2536 * being dequeued from the vote thread before we can consider
2537 * it safe to drop.
2538 *
2539 * You can *not* attempt to call cluster_lock on this lockres anymore. */
2540void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
2541{
2542 int status;
2543 struct ocfs2_mask_waiter mw;
2544 unsigned long flags;
2545
2546 ocfs2_init_mask_waiter(&mw);
2547
2548 spin_lock_irqsave(&lockres->l_lock, flags);
2549 lockres->l_flags |= OCFS2_LOCK_FREEING;
2550 while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
2551 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
2552 spin_unlock_irqrestore(&lockres->l_lock, flags);
2553
2554 mlog(0, "Waiting on lockres %s\n", lockres->l_name);
2555
2556 status = ocfs2_wait_for_mask(&mw);
2557 if (status)
2558 mlog_errno(status);
2559
2560 spin_lock_irqsave(&lockres->l_lock, flags);
2561 }
2562 spin_unlock_irqrestore(&lockres->l_lock, flags);
2563}
2564
Mark Fashehd680efe2006-09-08 14:14:34 -07002565void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
2566 struct ocfs2_lock_res *lockres)
2567{
2568 int ret;
2569
2570 ocfs2_mark_lockres_freeing(lockres);
Mark Fasheh0d5dc6c2006-09-14 14:44:51 -07002571 ret = ocfs2_drop_lock(osb, lockres);
Mark Fashehd680efe2006-09-08 14:14:34 -07002572 if (ret)
2573 mlog_errno(ret);
2574}
2575
Mark Fashehccd979b2005-12-15 14:31:24 -08002576static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2577{
Mark Fashehd680efe2006-09-08 14:14:34 -07002578 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
2579 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
Mark Fashehccd979b2005-12-15 14:31:24 -08002580}
2581
Mark Fashehccd979b2005-12-15 14:31:24 -08002582int ocfs2_drop_inode_locks(struct inode *inode)
2583{
2584 int status, err;
Mark Fashehccd979b2005-12-15 14:31:24 -08002585
2586 mlog_entry_void();
2587
2588 /* No need to call ocfs2_mark_lockres_freeing here -
2589 * ocfs2_clear_inode has done it for us. */
2590
2591 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
Tiger Yang50008632007-03-20 16:01:38 -07002592 &OCFS2_I(inode)->ip_open_lockres);
Mark Fashehccd979b2005-12-15 14:31:24 -08002593 if (err < 0)
2594 mlog_errno(err);
2595
2596 status = err;
2597
2598 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
Tiger Yang50008632007-03-20 16:01:38 -07002599 &OCFS2_I(inode)->ip_data_lockres);
2600 if (err < 0)
2601 mlog_errno(err);
2602 if (err < 0 && !status)
2603 status = err;
2604
2605 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
Mark Fasheh0d5dc6c2006-09-14 14:44:51 -07002606 &OCFS2_I(inode)->ip_meta_lockres);
Mark Fashehccd979b2005-12-15 14:31:24 -08002607 if (err < 0)
2608 mlog_errno(err);
2609 if (err < 0 && !status)
2610 status = err;
2611
2612 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
Mark Fasheh0d5dc6c2006-09-14 14:44:51 -07002613 &OCFS2_I(inode)->ip_rw_lockres);
Mark Fashehccd979b2005-12-15 14:31:24 -08002614 if (err < 0)
2615 mlog_errno(err);
2616 if (err < 0 && !status)
2617 status = err;
2618
2619 mlog_exit(status);
2620 return status;
2621}
2622
2623static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2624 int new_level)
2625{
2626 assert_spin_locked(&lockres->l_lock);
2627
2628 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
2629
2630 if (lockres->l_level <= new_level) {
2631 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
2632 lockres->l_level, new_level);
2633 BUG();
2634 }
2635
2636 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
2637 lockres->l_name, new_level, lockres->l_blocking);
2638
2639 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2640 lockres->l_requested = new_level;
2641 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2642}
2643
2644static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2645 struct ocfs2_lock_res *lockres,
2646 int new_level,
2647 int lvb)
2648{
2649 int ret, dlm_flags = LKM_CONVERT;
2650 enum dlm_status status;
2651
2652 mlog_entry_void();
2653
2654 if (lvb)
2655 dlm_flags |= LKM_VALBLK;
2656
2657 status = dlmlock(osb->dlm,
2658 new_level,
2659 &lockres->l_lksb,
2660 dlm_flags,
2661 lockres->l_name,
Mark Fashehf0681062006-09-08 11:40:10 -07002662 OCFS2_LOCK_ID_MAX_LEN - 1,
Mark Fashehe92d57d2006-09-12 21:34:35 -07002663 ocfs2_locking_ast,
Mark Fashehccd979b2005-12-15 14:31:24 -08002664 lockres,
Mark Fashehaa2623a2006-09-12 21:58:23 -07002665 ocfs2_blocking_ast);
Mark Fashehccd979b2005-12-15 14:31:24 -08002666 if (status != DLM_NORMAL) {
2667 ocfs2_log_dlm_error("dlmlock", status, lockres);
2668 ret = -EINVAL;
2669 ocfs2_recover_from_dlm_error(lockres, 1);
2670 goto bail;
2671 }
2672
2673 ret = 0;
2674bail:
2675 mlog_exit(ret);
2676 return ret;
2677}
2678
2679/* returns 1 when the caller should unlock and call dlmunlock */
2680static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2681 struct ocfs2_lock_res *lockres)
2682{
2683 assert_spin_locked(&lockres->l_lock);
2684
2685 mlog_entry_void();
2686 mlog(0, "lock %s\n", lockres->l_name);
2687
2688 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2689 /* If we're already trying to cancel a lock conversion
2690 * then just drop the spinlock and allow the caller to
2691 * requeue this lock. */
2692
2693 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
2694 return 0;
2695 }
2696
2697 /* were we in a convert when we got the bast fire? */
2698 BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
2699 lockres->l_action != OCFS2_AST_DOWNCONVERT);
2700 /* set things up for the unlockast to know to just
2701 * clear out the ast_action and unset busy, etc. */
2702 lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
2703
2704 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
2705 "lock %s, invalid flags: 0x%lx\n",
2706 lockres->l_name, lockres->l_flags);
2707
2708 return 1;
2709}
2710
2711static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2712 struct ocfs2_lock_res *lockres)
2713{
2714 int ret;
2715 enum dlm_status status;
2716
2717 mlog_entry_void();
2718 mlog(0, "lock %s\n", lockres->l_name);
2719
2720 ret = 0;
2721 status = dlmunlock(osb->dlm,
2722 &lockres->l_lksb,
2723 LKM_CANCEL,
Mark Fasheh2a45f2d2006-09-12 21:36:58 -07002724 ocfs2_unlock_ast,
Mark Fashehccd979b2005-12-15 14:31:24 -08002725 lockres);
2726 if (status != DLM_NORMAL) {
2727 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2728 ret = -EINVAL;
2729 ocfs2_recover_from_dlm_error(lockres, 0);
2730 }
2731
2732 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2733
2734 mlog_exit(ret);
2735 return ret;
2736}
2737
Mark Fashehb5e500e2006-09-13 22:01:16 -07002738static int ocfs2_unblock_lock(struct ocfs2_super *osb,
2739 struct ocfs2_lock_res *lockres,
2740 struct ocfs2_unblock_ctl *ctl)
Mark Fashehccd979b2005-12-15 14:31:24 -08002741{
2742 unsigned long flags;
2743 int blocking;
2744 int new_level;
2745 int ret = 0;
Mark Fasheh5ef0d4e2006-09-13 21:21:52 -07002746 int set_lvb = 0;
Mark Fashehccd979b2005-12-15 14:31:24 -08002747
2748 mlog_entry_void();
2749
2750 spin_lock_irqsave(&lockres->l_lock, flags);
2751
2752 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2753
2754recheck:
2755 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
Mark Fashehd680efe2006-09-08 14:14:34 -07002756 ctl->requeue = 1;
Mark Fashehccd979b2005-12-15 14:31:24 -08002757 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2758 spin_unlock_irqrestore(&lockres->l_lock, flags);
2759 if (ret) {
2760 ret = ocfs2_cancel_convert(osb, lockres);
2761 if (ret < 0)
2762 mlog_errno(ret);
2763 }
2764 goto leave;
2765 }
2766
2767 /* if we're blocking an exclusive and we have *any* holders,
2768 * then requeue. */
2769 if ((lockres->l_blocking == LKM_EXMODE)
Mark Fashehf7fbfdd2006-09-13 21:02:29 -07002770 && (lockres->l_ex_holders || lockres->l_ro_holders))
2771 goto leave_requeue;
Mark Fashehccd979b2005-12-15 14:31:24 -08002772
2773 /* If it's a PR we're blocking, then only
2774 * requeue if we've got any EX holders */
2775 if (lockres->l_blocking == LKM_PRMODE &&
Mark Fashehf7fbfdd2006-09-13 21:02:29 -07002776 lockres->l_ex_holders)
2777 goto leave_requeue;
2778
2779 /*
2780 * Can we get a lock in this state if the holder counts are
2781	 * zero? The metadata unblock code used to check this.
2782 */
2783 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
2784 && (lockres->l_flags & OCFS2_LOCK_REFRESHING))
2785 goto leave_requeue;
Mark Fashehccd979b2005-12-15 14:31:24 -08002786
Mark Fasheh16d5b9562006-09-13 21:10:12 -07002787 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2788
2789 if (lockres->l_ops->check_downconvert
2790 && !lockres->l_ops->check_downconvert(lockres, new_level))
2791 goto leave_requeue;
2792
Mark Fashehccd979b2005-12-15 14:31:24 -08002793 /* If we get here, then we know that there are no more
2794 * incompatible holders (and anyone asking for an incompatible
2795 * lock is blocked). We can now downconvert the lock */
Mark Fashehcc567d82006-09-13 21:52:21 -07002796 if (!lockres->l_ops->downconvert_worker)
Mark Fashehccd979b2005-12-15 14:31:24 -08002797 goto downconvert;
2798
2799 /* Some lockres types want to do a bit of work before
2800 * downconverting a lock. Allow that here. The worker function
2801 * may sleep, so we save off a copy of what we're blocking as
2802 * it may change while we're not holding the spin lock. */
2803 blocking = lockres->l_blocking;
2804 spin_unlock_irqrestore(&lockres->l_lock, flags);
2805
Mark Fashehcc567d82006-09-13 21:52:21 -07002806 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
Mark Fashehd680efe2006-09-08 14:14:34 -07002807
2808 if (ctl->unblock_action == UNBLOCK_STOP_POST)
2809 goto leave;
Mark Fashehccd979b2005-12-15 14:31:24 -08002810
2811 spin_lock_irqsave(&lockres->l_lock, flags);
2812 if (blocking != lockres->l_blocking) {
2813 /* If this changed underneath us, then we can't drop
2814 * it just yet. */
2815 goto recheck;
2816 }
2817
2818downconvert:
Mark Fashehd680efe2006-09-08 14:14:34 -07002819 ctl->requeue = 0;
Mark Fashehccd979b2005-12-15 14:31:24 -08002820
Mark Fasheh5ef0d4e2006-09-13 21:21:52 -07002821 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
2822 if (lockres->l_level == LKM_EXMODE)
2823 set_lvb = 1;
2824
2825 /*
2826 * We only set the lvb if the lock has been fully
2827 * refreshed - otherwise we risk setting stale
2828 * data. Otherwise, there's no need to actually clear
2829		 * out the lvb here as its value is still valid.
2830 */
2831 if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2832 lockres->l_ops->set_lvb(lockres);
2833 }
2834
Mark Fashehccd979b2005-12-15 14:31:24 -08002835 ocfs2_prepare_downconvert(lockres, new_level);
2836 spin_unlock_irqrestore(&lockres->l_lock, flags);
Mark Fasheh5ef0d4e2006-09-13 21:21:52 -07002837 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
Mark Fashehccd979b2005-12-15 14:31:24 -08002838leave:
2839 mlog_exit(ret);
2840 return ret;
Mark Fashehf7fbfdd2006-09-13 21:02:29 -07002841
2842leave_requeue:
2843 spin_unlock_irqrestore(&lockres->l_lock, flags);
2844 ctl->requeue = 1;
2845
2846 mlog_exit(0);
2847 return 0;
Mark Fashehccd979b2005-12-15 14:31:24 -08002848}
2849
Mark Fashehd680efe2006-09-08 14:14:34 -07002850static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2851 int blocking)
Mark Fashehccd979b2005-12-15 14:31:24 -08002852{
2853 struct inode *inode;
2854 struct address_space *mapping;
2855
Mark Fashehccd979b2005-12-15 14:31:24 -08002856 inode = ocfs2_lock_res_inode(lockres);
2857 mapping = inode->i_mapping;
2858
Mark Fasheh7f4a2a92006-12-11 11:06:36 -08002859 /*
2860 * We need this before the filemap_fdatawrite() so that it can
2861 * transfer the dirty bit from the PTE to the
2862 * page. Unfortunately this means that even for EX->PR
2863 * downconverts, we'll lose our mappings and have to build
2864 * them up again.
2865 */
2866 unmap_mapping_range(mapping, 0, 0, 0);
2867
Mark Fashehccd979b2005-12-15 14:31:24 -08002868 if (filemap_fdatawrite(mapping)) {
Mark Fashehb0697052006-03-03 10:24:33 -08002869 mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
2870 (unsigned long long)OCFS2_I(inode)->ip_blkno);
Mark Fashehccd979b2005-12-15 14:31:24 -08002871 }
2872 sync_mapping_buffers(mapping);
2873 if (blocking == LKM_EXMODE) {
2874 truncate_inode_pages(mapping, 0);
Mark Fashehccd979b2005-12-15 14:31:24 -08002875 } else {
2876 /* We only need to wait on the I/O if we're not also
2877 * truncating pages because truncate_inode_pages waits
2878 * for us above. We don't truncate pages if we're
2879 * blocking anything < EXMODE because we want to keep
2880 * them around in that case. */
2881 filemap_fdatawait(mapping);
2882 }
2883
Mark Fashehd680efe2006-09-08 14:14:34 -07002884 return UNBLOCK_CONTINUE;
Mark Fashehccd979b2005-12-15 14:31:24 -08002885}
2886
Mark Fasheh810d5ae2006-09-13 21:39:52 -07002887static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
2888 int new_level)
2889{
2890 struct inode *inode = ocfs2_lock_res_inode(lockres);
2891 int checkpointed = ocfs2_inode_fully_checkpointed(inode);
2892
2893 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2894 BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed);
2895
2896 if (checkpointed)
2897 return 1;
2898
2899 ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb));
2900 return 0;
2901}
2902
2903static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
2904{
2905 struct inode *inode = ocfs2_lock_res_inode(lockres);
2906
2907 __ocfs2_stuff_meta_lvb(inode);
2908}
2909
Mark Fashehd680efe2006-09-08 14:14:34 -07002910/*
2911 * Does the final reference drop on our dentry lock. Right now this
2912 * happens in the vote thread, but we could choose to simplify the
2913 * dlmglue API and push these off to the ocfs2_wq in the future.
2914 */
2915static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
2916 struct ocfs2_lock_res *lockres)
2917{
2918 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
2919 ocfs2_dentry_lock_put(osb, dl);
2920}
2921
2922/*
2923 * d_delete() matching dentries before the lock downconvert.
2924 *
2925 * At this point, any process waiting to destroy the
2926 * dentry_lock due to last ref count is stopped by the
2927 * OCFS2_LOCK_QUEUED flag.
2928 *
2929 * We have two potential problems
2930 *
2931 * 1) If we do the last reference drop on our dentry_lock (via dput)
2932 * we'll wind up in ocfs2_release_dentry_lock(), waiting on
2933 * the downconvert to finish. Instead we take an elevated
2934 * reference and push the drop until after we've completed our
2935 * unblock processing.
2936 *
2937 * 2) There might be another process with a final reference,
2938 * waiting on us to finish processing. If this is the case, we
2939 * detect it and exit out - there are no more dentries anyway.
2940 */
2941static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
2942 int blocking)
2943{
2944 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
2945 struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode);
2946 struct dentry *dentry;
2947 unsigned long flags;
2948 int extra_ref = 0;
2949
2950 /*
2951 * This node is blocking another node from getting a read
2952 * lock. This happens when we've renamed within a
2953 * directory. We've forced the other nodes to d_delete(), but
2954 * we never actually dropped our lock because it's still
2955 * valid. The downconvert code will retain a PR for this node,
2956 * so there's no further work to do.
2957 */
2958 if (blocking == LKM_PRMODE)
2959 return UNBLOCK_CONTINUE;
2960
2961 /*
2962 * Mark this inode as potentially orphaned. The code in
2963 * ocfs2_delete_inode() will figure out whether it actually
2964 * needs to be freed or not.
2965 */
2966 spin_lock(&oi->ip_lock);
2967 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2968 spin_unlock(&oi->ip_lock);
2969
2970 /*
2971 * Yuck. We need to make sure however that the check of
2972 * OCFS2_LOCK_FREEING and the extra reference are atomic with
2973 * respect to a reference decrement or the setting of that
2974 * flag.
2975 */
2976 spin_lock_irqsave(&lockres->l_lock, flags);
2977 spin_lock(&dentry_attach_lock);
2978 if (!(lockres->l_flags & OCFS2_LOCK_FREEING)
2979 && dl->dl_count) {
2980 dl->dl_count++;
2981 extra_ref = 1;
2982 }
2983 spin_unlock(&dentry_attach_lock);
2984 spin_unlock_irqrestore(&lockres->l_lock, flags);
2985
2986 mlog(0, "extra_ref = %d\n", extra_ref);
2987
2988 /*
2989 * We have a process waiting on us in ocfs2_dentry_iput(),
2990 * which means we can't have any more outstanding
2991 * aliases. There's no need to do any more work.
2992 */
2993 if (!extra_ref)
2994 return UNBLOCK_CONTINUE;
2995
2996 spin_lock(&dentry_attach_lock);
2997 while (1) {
2998 dentry = ocfs2_find_local_alias(dl->dl_inode,
2999 dl->dl_parent_blkno, 1);
3000 if (!dentry)
3001 break;
3002 spin_unlock(&dentry_attach_lock);
3003
3004 mlog(0, "d_delete(%.*s);\n", dentry->d_name.len,
3005 dentry->d_name.name);
3006
3007 /*
3008 * The following dcache calls may do an
3009 * iput(). Normally we don't want that from the
3010 * downconverting thread, but in this case it's ok
3011 * because the requesting node already has an
3012 * exclusive lock on the inode, so it can't be queued
3013 * for a downconvert.
3014 */
3015 d_delete(dentry);
3016 dput(dentry);
3017
3018 spin_lock(&dentry_attach_lock);
3019 }
3020 spin_unlock(&dentry_attach_lock);
3021
3022 /*
3023 * If we are the last holder of this dentry lock, there is no
3024 * reason to downconvert so skip straight to the unlock.
3025 */
3026 if (dl->dl_count == 1)
3027 return UNBLOCK_STOP_POST;
3028
3029 return UNBLOCK_CONTINUE_POST;
3030}
3031
Mark Fashehccd979b2005-12-15 14:31:24 -08003032void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3033 struct ocfs2_lock_res *lockres)
3034{
3035 int status;
Mark Fashehd680efe2006-09-08 14:14:34 -07003036 struct ocfs2_unblock_ctl ctl = {0, 0,};
Mark Fashehccd979b2005-12-15 14:31:24 -08003037 unsigned long flags;
3038
3039 /* Our reference to the lockres in this function can be
3040 * considered valid until we remove the OCFS2_LOCK_QUEUED
3041 * flag. */
3042
3043 mlog_entry_void();
3044
3045 BUG_ON(!lockres);
3046 BUG_ON(!lockres->l_ops);
Mark Fashehccd979b2005-12-15 14:31:24 -08003047
3048 mlog(0, "lockres %s blocked.\n", lockres->l_name);
3049
3050 /* Detect whether a lock has been marked as going away while
3051 * the vote thread was processing other things. A lock can
3052 * still be marked with OCFS2_LOCK_FREEING after this check,
3053	 * but short circuiting here will still save us some
3054	 * work. */
3055 spin_lock_irqsave(&lockres->l_lock, flags);
3056 if (lockres->l_flags & OCFS2_LOCK_FREEING)
3057 goto unqueue;
3058 spin_unlock_irqrestore(&lockres->l_lock, flags);
3059
Mark Fashehb5e500e2006-09-13 22:01:16 -07003060 status = ocfs2_unblock_lock(osb, lockres, &ctl);
Mark Fashehccd979b2005-12-15 14:31:24 -08003061 if (status < 0)
3062 mlog_errno(status);
3063
3064 spin_lock_irqsave(&lockres->l_lock, flags);
3065unqueue:
Mark Fashehd680efe2006-09-08 14:14:34 -07003066 if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) {
Mark Fashehccd979b2005-12-15 14:31:24 -08003067 lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
3068 } else
3069 ocfs2_schedule_blocked_lock(osb, lockres);
3070
3071 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
Mark Fashehd680efe2006-09-08 14:14:34 -07003072 ctl.requeue ? "yes" : "no");
Mark Fashehccd979b2005-12-15 14:31:24 -08003073 spin_unlock_irqrestore(&lockres->l_lock, flags);
3074
Mark Fashehd680efe2006-09-08 14:14:34 -07003075 if (ctl.unblock_action != UNBLOCK_CONTINUE
3076 && lockres->l_ops->post_unlock)
3077 lockres->l_ops->post_unlock(osb, lockres);
3078
Mark Fashehccd979b2005-12-15 14:31:24 -08003079 mlog_exit_void();
3080}
3081
3082static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
3083 struct ocfs2_lock_res *lockres)
3084{
3085 mlog_entry_void();
3086
3087 assert_spin_locked(&lockres->l_lock);
3088
3089 if (lockres->l_flags & OCFS2_LOCK_FREEING) {
3090 /* Do not schedule a lock for downconvert when it's on
3091 * the way to destruction - any nodes wanting access
3092 * to the resource will get it soon. */
3093 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
3094 lockres->l_name, lockres->l_flags);
3095 return;
3096 }
3097
3098 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
3099
3100 spin_lock(&osb->vote_task_lock);
3101 if (list_empty(&lockres->l_blocked_list)) {
3102 list_add_tail(&lockres->l_blocked_list,
3103 &osb->blocked_lock_list);
3104 osb->blocked_lock_count++;
3105 }
3106 spin_unlock(&osb->vote_task_lock);
3107
3108 mlog_exit_void();
3109}