Kurt Hackel | 6714d8e | 2005-12-15 14:31:23 -0800 | [diff] [blame] | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
| 3 | * |
| 4 | * dlmast.c |
| 5 | * |
| 6 | * AST and BAST functionality for local and remote nodes |
| 7 | * |
| 8 | * Copyright (C) 2004 Oracle. All rights reserved. |
| 9 | * |
| 10 | * This program is free software; you can redistribute it and/or |
| 11 | * modify it under the terms of the GNU General Public |
| 12 | * License as published by the Free Software Foundation; either |
| 13 | * version 2 of the License, or (at your option) any later version. |
| 14 | * |
| 15 | * This program is distributed in the hope that it will be useful, |
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 18 | * General Public License for more details. |
| 19 | * |
| 20 | * You should have received a copy of the GNU General Public |
| 21 | * License along with this program; if not, write to the |
| 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
| 23 | * Boston, MA 02111-1307, USA. |
| 24 | * |
| 25 | */ |
| 26 | |
| 27 | |
| 28 | #include <linux/module.h> |
| 29 | #include <linux/fs.h> |
| 30 | #include <linux/types.h> |
| 31 | #include <linux/slab.h> |
| 32 | #include <linux/highmem.h> |
| 33 | #include <linux/utsname.h> |
| 34 | #include <linux/init.h> |
| 35 | #include <linux/sysctl.h> |
| 36 | #include <linux/random.h> |
| 37 | #include <linux/blkdev.h> |
| 38 | #include <linux/socket.h> |
| 39 | #include <linux/inet.h> |
| 40 | #include <linux/spinlock.h> |
| 41 | |
| 42 | |
| 43 | #include "cluster/heartbeat.h" |
| 44 | #include "cluster/nodemanager.h" |
| 45 | #include "cluster/tcp.h" |
| 46 | #include "cluster/endian.h" |
| 47 | |
| 48 | #include "dlmapi.h" |
| 49 | #include "dlmcommon.h" |
| 50 | |
| 51 | #define MLOG_MASK_PREFIX ML_DLM |
| 52 | #include "cluster/masklog.h" |
| 53 | |
/* Forward declarations for file-local helpers defined further down. */
static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
static void dlm_update_lvb(struct dlm_ctxt *dlm,
			   struct dlm_lock_resource *res,
			   struct dlm_lock *lock);
| 57 | |
| 58 | /* Should be called as an ast gets queued to see if the new |
| 59 | * lock level will obsolete a pending bast. |
| 60 | * For example, if dlm_thread queued a bast for an EX lock that |
| 61 | * was blocking another EX, but before sending the bast the |
| 62 | * lock owner downconverted to NL, the bast is now obsolete. |
| 63 | * Only the ast should be sent. |
| 64 | * This is needed because the lock and convert paths can queue |
| 65 | * asts out-of-band (not waiting for dlm_thread) in order to |
| 66 | * allow for LKM_NOQUEUE to get immediate responses. */ |
| 67 | static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) |
| 68 | { |
| 69 | assert_spin_locked(&dlm->ast_lock); |
| 70 | assert_spin_locked(&lock->spinlock); |
| 71 | |
| 72 | if (lock->ml.highest_blocked == LKM_IVMODE) |
| 73 | return 0; |
| 74 | BUG_ON(lock->ml.highest_blocked == LKM_NLMODE); |
| 75 | |
| 76 | if (lock->bast_pending && |
| 77 | list_empty(&lock->bast_list)) |
| 78 | /* old bast already sent, ok */ |
| 79 | return 0; |
| 80 | |
| 81 | if (lock->ml.type == LKM_EXMODE) |
| 82 | /* EX blocks anything left, any bast still valid */ |
| 83 | return 0; |
| 84 | else if (lock->ml.type == LKM_NLMODE) |
| 85 | /* NL blocks nothing, no reason to send any bast, cancel it */ |
| 86 | return 1; |
| 87 | else if (lock->ml.highest_blocked != LKM_EXMODE) |
| 88 | /* PR only blocks EX */ |
| 89 | return 1; |
| 90 | |
| 91 | return 0; |
| 92 | } |
| 93 | |
| 94 | static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) |
| 95 | { |
| 96 | mlog_entry_void(); |
| 97 | |
| 98 | BUG_ON(!dlm); |
| 99 | BUG_ON(!lock); |
| 100 | |
| 101 | assert_spin_locked(&dlm->ast_lock); |
| 102 | if (!list_empty(&lock->ast_list)) { |
| 103 | mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", |
| 104 | lock->ast_pending, lock->ml.type); |
| 105 | BUG(); |
| 106 | } |
| 107 | BUG_ON(!list_empty(&lock->ast_list)); |
| 108 | if (lock->ast_pending) |
| 109 | mlog(0, "lock has an ast getting flushed right now\n"); |
| 110 | |
| 111 | /* putting lock on list, add a ref */ |
| 112 | dlm_lock_get(lock); |
| 113 | spin_lock(&lock->spinlock); |
| 114 | |
| 115 | /* check to see if this ast obsoletes the bast */ |
| 116 | if (dlm_should_cancel_bast(dlm, lock)) { |
| 117 | struct dlm_lock_resource *res = lock->lockres; |
| 118 | mlog(0, "%s: cancelling bast for %.*s\n", |
| 119 | dlm->name, res->lockname.len, res->lockname.name); |
| 120 | lock->bast_pending = 0; |
| 121 | list_del_init(&lock->bast_list); |
| 122 | lock->ml.highest_blocked = LKM_IVMODE; |
| 123 | /* removing lock from list, remove a ref. guaranteed |
| 124 | * this won't be the last ref because of the get above, |
| 125 | * so res->spinlock will not be taken here */ |
| 126 | dlm_lock_put(lock); |
| 127 | /* free up the reserved bast that we are cancelling. |
| 128 | * guaranteed that this will not be the last reserved |
| 129 | * ast because *both* an ast and a bast were reserved |
| 130 | * to get to this point. the res->spinlock will not be |
| 131 | * taken here */ |
| 132 | dlm_lockres_release_ast(dlm, res); |
| 133 | } |
| 134 | list_add_tail(&lock->ast_list, &dlm->pending_asts); |
| 135 | lock->ast_pending = 1; |
| 136 | spin_unlock(&lock->spinlock); |
| 137 | } |
| 138 | |
| 139 | void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) |
| 140 | { |
| 141 | mlog_entry_void(); |
| 142 | |
| 143 | BUG_ON(!dlm); |
| 144 | BUG_ON(!lock); |
| 145 | |
| 146 | spin_lock(&dlm->ast_lock); |
| 147 | __dlm_queue_ast(dlm, lock); |
| 148 | spin_unlock(&dlm->ast_lock); |
| 149 | } |
| 150 | |
| 151 | |
| 152 | static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) |
| 153 | { |
| 154 | mlog_entry_void(); |
| 155 | |
| 156 | BUG_ON(!dlm); |
| 157 | BUG_ON(!lock); |
| 158 | assert_spin_locked(&dlm->ast_lock); |
| 159 | |
| 160 | BUG_ON(!list_empty(&lock->bast_list)); |
| 161 | if (lock->bast_pending) |
| 162 | mlog(0, "lock has a bast getting flushed right now\n"); |
| 163 | |
| 164 | /* putting lock on list, add a ref */ |
| 165 | dlm_lock_get(lock); |
| 166 | spin_lock(&lock->spinlock); |
| 167 | list_add_tail(&lock->bast_list, &dlm->pending_basts); |
| 168 | lock->bast_pending = 1; |
| 169 | spin_unlock(&lock->spinlock); |
| 170 | } |
| 171 | |
| 172 | void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) |
| 173 | { |
| 174 | mlog_entry_void(); |
| 175 | |
| 176 | BUG_ON(!dlm); |
| 177 | BUG_ON(!lock); |
| 178 | |
| 179 | spin_lock(&dlm->ast_lock); |
| 180 | __dlm_queue_bast(dlm, lock); |
| 181 | spin_unlock(&dlm->ast_lock); |
| 182 | } |
| 183 | |
| 184 | static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, |
| 185 | struct dlm_lock *lock) |
| 186 | { |
| 187 | struct dlm_lockstatus *lksb = lock->lksb; |
| 188 | BUG_ON(!lksb); |
| 189 | |
| 190 | /* only updates if this node masters the lockres */ |
| 191 | if (res->owner == dlm->node_num) { |
| 192 | |
| 193 | spin_lock(&res->spinlock); |
| 194 | /* check the lksb flags for the direction */ |
| 195 | if (lksb->flags & DLM_LKSB_GET_LVB) { |
| 196 | mlog(0, "getting lvb from lockres for %s node\n", |
| 197 | lock->ml.node == dlm->node_num ? "master" : |
| 198 | "remote"); |
| 199 | memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); |
| 200 | } else if (lksb->flags & DLM_LKSB_PUT_LVB) { |
| 201 | mlog(0, "setting lvb from lockres for %s node\n", |
| 202 | lock->ml.node == dlm->node_num ? "master" : |
| 203 | "remote"); |
| 204 | memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); |
| 205 | } |
| 206 | spin_unlock(&res->spinlock); |
| 207 | } |
| 208 | |
| 209 | /* reset any lvb flags on the lksb */ |
| 210 | lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); |
| 211 | } |
| 212 | |
| 213 | void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, |
| 214 | struct dlm_lock *lock) |
| 215 | { |
| 216 | dlm_astlockfunc_t *fn; |
| 217 | struct dlm_lockstatus *lksb; |
| 218 | |
| 219 | mlog_entry_void(); |
| 220 | |
| 221 | lksb = lock->lksb; |
| 222 | fn = lock->ast; |
| 223 | BUG_ON(lock->ml.node != dlm->node_num); |
| 224 | |
| 225 | dlm_update_lvb(dlm, res, lock); |
| 226 | (*fn)(lock->astdata); |
| 227 | } |
| 228 | |
| 229 | |
| 230 | int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, |
| 231 | struct dlm_lock *lock) |
| 232 | { |
| 233 | int ret; |
| 234 | struct dlm_lockstatus *lksb; |
| 235 | int lksbflags; |
| 236 | |
| 237 | mlog_entry_void(); |
| 238 | |
| 239 | lksb = lock->lksb; |
| 240 | BUG_ON(lock->ml.node == dlm->node_num); |
| 241 | |
| 242 | lksbflags = lksb->flags; |
| 243 | dlm_update_lvb(dlm, res, lock); |
| 244 | |
| 245 | /* lock request came from another node |
| 246 | * go do the ast over there */ |
| 247 | ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags); |
| 248 | return ret; |
| 249 | } |
| 250 | |
| 251 | void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, |
| 252 | struct dlm_lock *lock, int blocked_type) |
| 253 | { |
| 254 | dlm_bastlockfunc_t *fn = lock->bast; |
| 255 | |
| 256 | mlog_entry_void(); |
| 257 | BUG_ON(lock->ml.node != dlm->node_num); |
| 258 | |
| 259 | (*fn)(lock->astdata, blocked_type); |
| 260 | } |
| 261 | |
| 262 | |
| 263 | |
| 264 | int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) |
| 265 | { |
| 266 | int ret; |
| 267 | unsigned int locklen; |
| 268 | struct dlm_ctxt *dlm = data; |
| 269 | struct dlm_lock_resource *res = NULL; |
| 270 | struct dlm_lock *lock = NULL; |
| 271 | struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; |
| 272 | char *name; |
| 273 | struct list_head *iter, *head=NULL; |
| 274 | u64 cookie; |
| 275 | u32 flags; |
| 276 | |
| 277 | if (!dlm_grab(dlm)) { |
| 278 | dlm_error(DLM_REJECTED); |
| 279 | return DLM_REJECTED; |
| 280 | } |
| 281 | |
| 282 | mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), |
| 283 | "Domain %s not fully joined!\n", dlm->name); |
| 284 | |
| 285 | name = past->name; |
| 286 | locklen = past->namelen; |
| 287 | cookie = be64_to_cpu(past->cookie); |
| 288 | flags = be32_to_cpu(past->flags); |
| 289 | |
| 290 | if (locklen > DLM_LOCKID_NAME_MAX) { |
| 291 | ret = DLM_IVBUFLEN; |
| 292 | mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); |
| 293 | goto leave; |
| 294 | } |
| 295 | |
| 296 | if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == |
| 297 | (LKM_PUT_LVB|LKM_GET_LVB)) { |
| 298 | mlog(ML_ERROR, "both PUT and GET lvb specified\n"); |
| 299 | ret = DLM_BADARGS; |
| 300 | goto leave; |
| 301 | } |
| 302 | |
| 303 | mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : |
| 304 | (flags & LKM_GET_LVB ? "get lvb" : "none")); |
| 305 | |
| 306 | mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); |
| 307 | |
| 308 | if (past->type != DLM_AST && |
| 309 | past->type != DLM_BAST) { |
| 310 | mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", " |
| 311 | "name=%.*s\n", past->type, cookie, locklen, name); |
| 312 | ret = DLM_IVLOCKID; |
| 313 | goto leave; |
| 314 | } |
| 315 | |
| 316 | res = dlm_lookup_lockres(dlm, name, locklen); |
| 317 | if (!res) { |
| 318 | mlog(ML_ERROR, "got %sast for unknown lockres! " |
| 319 | "cookie=%"MLFu64", name=%.*s, namelen=%u\n", |
| 320 | past->type == DLM_AST ? "" : "b", |
| 321 | cookie, locklen, name, locklen); |
| 322 | ret = DLM_IVLOCKID; |
| 323 | goto leave; |
| 324 | } |
| 325 | |
| 326 | /* cannot get a proxy ast message if this node owns it */ |
| 327 | BUG_ON(res->owner == dlm->node_num); |
| 328 | |
| 329 | mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); |
| 330 | |
| 331 | spin_lock(&res->spinlock); |
| 332 | if (res->state & DLM_LOCK_RES_RECOVERING) { |
| 333 | mlog(0, "responding with DLM_RECOVERING!\n"); |
| 334 | ret = DLM_RECOVERING; |
| 335 | goto unlock_out; |
| 336 | } |
| 337 | if (res->state & DLM_LOCK_RES_MIGRATING) { |
| 338 | mlog(0, "responding with DLM_MIGRATING!\n"); |
| 339 | ret = DLM_MIGRATING; |
| 340 | goto unlock_out; |
| 341 | } |
| 342 | /* try convert queue for both ast/bast */ |
| 343 | head = &res->converting; |
| 344 | lock = NULL; |
| 345 | list_for_each(iter, head) { |
| 346 | lock = list_entry (iter, struct dlm_lock, list); |
| 347 | if (be64_to_cpu(lock->ml.cookie) == cookie) |
| 348 | goto do_ast; |
| 349 | } |
| 350 | |
| 351 | /* if not on convert, try blocked for ast, granted for bast */ |
| 352 | if (past->type == DLM_AST) |
| 353 | head = &res->blocked; |
| 354 | else |
| 355 | head = &res->granted; |
| 356 | |
| 357 | list_for_each(iter, head) { |
| 358 | lock = list_entry (iter, struct dlm_lock, list); |
| 359 | if (be64_to_cpu(lock->ml.cookie) == cookie) |
| 360 | goto do_ast; |
| 361 | } |
| 362 | |
| 363 | mlog(ML_ERROR, "got %sast for unknown lock! cookie=%"MLFu64", " |
| 364 | "name=%.*s, namelen=%u\n", |
| 365 | past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen); |
| 366 | |
| 367 | ret = DLM_NORMAL; |
| 368 | unlock_out: |
| 369 | spin_unlock(&res->spinlock); |
| 370 | goto leave; |
| 371 | |
| 372 | do_ast: |
| 373 | ret = DLM_NORMAL; |
| 374 | if (past->type == DLM_AST) { |
| 375 | /* do not alter lock refcount. switching lists. */ |
| 376 | list_del_init(&lock->list); |
| 377 | list_add_tail(&lock->list, &res->granted); |
| 378 | mlog(0, "ast: adding to granted list... type=%d, " |
| 379 | "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); |
| 380 | if (lock->ml.convert_type != LKM_IVMODE) { |
| 381 | lock->ml.type = lock->ml.convert_type; |
| 382 | lock->ml.convert_type = LKM_IVMODE; |
| 383 | } else { |
| 384 | // should already be there.... |
| 385 | } |
| 386 | |
| 387 | lock->lksb->status = DLM_NORMAL; |
| 388 | |
| 389 | /* if we requested the lvb, fetch it into our lksb now */ |
| 390 | if (flags & LKM_GET_LVB) { |
| 391 | BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB)); |
| 392 | memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN); |
| 393 | } |
| 394 | } |
| 395 | spin_unlock(&res->spinlock); |
| 396 | |
| 397 | if (past->type == DLM_AST) |
| 398 | dlm_do_local_ast(dlm, res, lock); |
| 399 | else |
| 400 | dlm_do_local_bast(dlm, res, lock, past->blocked_type); |
| 401 | |
| 402 | leave: |
| 403 | |
| 404 | if (res) |
| 405 | dlm_lockres_put(res); |
| 406 | |
| 407 | dlm_put(dlm); |
| 408 | return ret; |
| 409 | } |
| 410 | |
| 411 | |
| 412 | |
| 413 | int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, |
| 414 | struct dlm_lock *lock, int msg_type, |
| 415 | int blocked_type, int flags) |
| 416 | { |
| 417 | int ret = 0; |
| 418 | struct dlm_proxy_ast past; |
| 419 | struct kvec vec[2]; |
| 420 | size_t veclen = 1; |
| 421 | int status; |
| 422 | |
| 423 | mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", |
| 424 | res->lockname.len, res->lockname.name, lock->ml.node, |
| 425 | msg_type, blocked_type); |
| 426 | |
| 427 | memset(&past, 0, sizeof(struct dlm_proxy_ast)); |
| 428 | past.node_idx = dlm->node_num; |
| 429 | past.type = msg_type; |
| 430 | past.blocked_type = blocked_type; |
| 431 | past.namelen = res->lockname.len; |
| 432 | memcpy(past.name, res->lockname.name, past.namelen); |
| 433 | past.cookie = lock->ml.cookie; |
| 434 | |
| 435 | vec[0].iov_len = sizeof(struct dlm_proxy_ast); |
| 436 | vec[0].iov_base = &past; |
| 437 | if (flags & DLM_LKSB_GET_LVB) { |
| 438 | mlog(0, "returning requested LVB data\n"); |
| 439 | be32_add_cpu(&past.flags, LKM_GET_LVB); |
| 440 | vec[1].iov_len = DLM_LVB_LEN; |
| 441 | vec[1].iov_base = lock->lksb->lvb; |
| 442 | veclen++; |
| 443 | } |
| 444 | |
| 445 | ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, |
| 446 | lock->ml.node, &status); |
| 447 | if (ret < 0) |
| 448 | mlog_errno(ret); |
| 449 | else { |
| 450 | if (status == DLM_RECOVERING) { |
| 451 | mlog(ML_ERROR, "sent AST to node %u, it thinks this " |
| 452 | "node is dead!\n", lock->ml.node); |
| 453 | BUG(); |
| 454 | } else if (status == DLM_MIGRATING) { |
| 455 | mlog(ML_ERROR, "sent AST to node %u, it returned " |
| 456 | "DLM_MIGRATING!\n", lock->ml.node); |
| 457 | BUG(); |
| 458 | } else if (status != DLM_NORMAL) { |
| 459 | mlog(ML_ERROR, "AST to node %u returned %d!\n", |
| 460 | lock->ml.node, status); |
| 461 | /* ignore it */ |
| 462 | } |
| 463 | ret = 0; |
| 464 | } |
| 465 | return ret; |
| 466 | } |