/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameters, defined here with their defaults */
unsigned int minor_count = 32;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;	/* detail level in /proc/drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular, doubly linked list of requests
 * attached.
 */
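/* tl_init() - allocate and link the initial transfer log epoch.
 * The single pre-allocated epoch acts as both oldest and newest element;
 * returns 1 on success, 0 if the allocation failed. */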
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

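/* tl_cleanup() - release the transfer log on device cleanup.
 * Expects the log to be empty again (only the sentinel epoch left)
 * and frees that epoch, any spare epoch, and the tl_hash table. */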
static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch object, this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		       unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	 */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}

/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

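/* tl_restart() - same as _tl_restart(), but takes and releases the req_lock itself. */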
void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

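/* drbd_change_state() - apply mask/val to the current state under req_lock.
 * Returns the SS_* result of _drbd_set_state(); does not wait for the
 * after-state-change work to finish. */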
int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		      union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	int rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *,
				     union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

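/* _req_st_cond() - wait_event() condition for a cluster-wide state change:
 * returns a final SS_* code once the peer acked or rejected the request
 * (or no cluster-wide change is needed), and 0 while we still have to wait. */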
static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
					      union drbd_state mask, union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	int rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = 0; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform a possibly cluster wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static int drbd_req_state(struct drbd_conf *mdev,
			  union drbd_state mask, union drbd_state val,
			  enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	int rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
			union drbd_state val, enum chg_state_flags f)
{
	int rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

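/* print_st() - log one drbd state in the compact cs/ro/ds + flags notation. */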
static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev,
	union drbd_state os, union drbd_state ns, int err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str

#define drbd_susp_str(A)     ((A) ? "1" : "0")
#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
#define drbd_user_isp_str(A) ((A) ? "1" : "0")

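/* PSC() appends "field( old -> new )" to the buffer at pbp whenever the field
 * differs between os and ns; it relies on os, ns and pbp being in scope at the
 * call site (see __drbd_set_state below). */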
#define PSC(A) \
	({ if (ns.A != os.A) { \
		pbp += sprintf(pbp, #A "( %s -> %s ) ", \
			      drbd_##A##_str(os.A), \
			      drbd_##A##_str(ns.A)); \
	} })

/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	int rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static int is_valid_state_transition(struct drbd_conf *mdev,
				     union drbd_state ns, union drbd_state os)
{
	int rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort:
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn_sync_abort)
			*warn_sync_abort =
				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				"Online-verify" : "Resync";
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection state on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary */
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion that will be completed after after_state_ch() has finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
int __drbd_set_state(struct drbd_conf *mdev,
		    union drbd_state ns, enum chg_state_flags flags,
		    struct completion *done)
{
	union drbd_state os;
	int rv = SS_SUCCESS;
	const char *warn_sync_abort = NULL;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	if (warn_sync_abort)
		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);

	{
		char *pbp, pb[300];
		pbp = pb;
		*pbp = 0;
		PSC(role);
		PSC(peer);
		PSC(conn);
		PSC(disk);
		PSC(pdsk);
		if (is_susp(ns) != is_susp(os))
			pbp += sprintf(pbp, "susp( %s -> %s ) ",
				       drbd_susp_str(is_susp(os)),
				       drbd_susp_str(is_susp(ns)));
		PSC(aftr_isp);
		PSC(peer_isp);
		PSC(user_isp);
		dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;
	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

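/* w_after_state_ch() - worker callback: run the (possibly sleeping) after-state-change
 * actions and complete the optional completion handed in by __drbd_set_state(). */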
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

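/* abw_start_sync() - "after bitmap write" callback: once the bitmap is on disk,
 * continue to WFSyncUUID (sync target) or start the resync (sync source);
 * fall back to C_CONNECTED if writing the bitmap failed. */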
static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			if (ns.conn == C_CONNECTED)
				what = resend, nsm.susp_nod = 0;
			else /* ns.conn > C_CONNECTED */
				dev_err(DEV, "Unexpected Resync going on!\n");
		}

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = restart_frozen_disk_io, nsm.susp_nod = 0;

	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					set_bit(NEW_CUR_UUID, &mdev->flags);
				} else {
					drbd_uuid_new_current(mdev);
					drbd_send_uuids(mdev);
				}
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
			drbd_uuid_new_current(mdev);
			drbd_send_uuids(mdev);
		}

		/* D_DISKLESS Peer becomes secondary */
		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
			drbd_al_to_on_disk_bm(mdev);
		put_ldev(mdev);
	}

	/* Last part of the attaching process ... */
	if (ns.conn >= C_CONNECTED &&
	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}

	/* We want to pause/continue resync, tell peer. */
	if (ns.conn >= C_CONNECTED &&
	     ((os.aftr_isp != ns.aftr_isp) ||
	      (os.user_isp != ns.user_isp)))
		drbd_send_state(mdev);

	/* In case one of the isp bits got set, suspend other devices. */
	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
		suspend_other_sg(mdev);

	/* Make sure the peer gets informed about possible state
	   changes (ISP bits) while we were in WFReportParams. */
	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
		drbd_send_state(mdev);

	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
		drbd_send_state(mdev);

	/* We are in the progress to start a full sync... */
	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");

	/* We are invalidating ourselves... */
	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");

	/* first half of local IO error, failure to attach,
	 * or administrative detach */
	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
		enum drbd_io_error_p eh;
		int was_io_error;
		/* corresponding get_ldev was in __drbd_set_state, to serialize
		 * our cleanup here with the transition to D_DISKLESS,
		 * so it is safe to dereference ldev here. */
		eh = mdev->ldev->dc.on_io_error;
		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);

		/* current state still has to be D_FAILED,
		 * there is only one way out: to D_DISKLESS,
		 * and that may only happen after our put_ldev below. */
		if (mdev->state.disk != D_FAILED)
			dev_err(DEV,
				"ASSERT FAILED: disk is %s during detach\n",
				drbd_disk_str(mdev->state.disk));

		if (drbd_send_state(mdev))
			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
		else
			dev_err(DEV, "Sending state for detaching disk failed\n");

		drbd_rs_cancel_all(mdev);

		/* In case we want to get something to stable storage still,
		 * this may be the last chance.
		 * Following put_ldev may transition to D_DISKLESS. */
		drbd_md_sync(mdev);
		put_ldev(mdev);

		if (was_io_error && eh == EP_CALL_HELPER)
			drbd_khelper(mdev, "local-io-error");
	}

	/* second half of local IO error, failure to attach,
	 * or administrative detach,
	 * after local_cnt references have reached zero again */
	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
		/* We must still be diskless,
		 * re-attach has to be serialized with this! */
		if (mdev->state.disk != D_DISKLESS)
			dev_err(DEV,
				"ASSERT FAILED: disk is %s while going diskless\n",
				drbd_disk_str(mdev->state.disk));

		mdev->rs_total = 0;
		mdev->rs_failed = 0;
		atomic_set(&mdev->rs_pending_cnt, 0);

		if (drbd_send_state(mdev))
			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
		else
			dev_err(DEV, "Sending state for being diskless failed\n");
		/* corresponding get_ldev in __drbd_set_state
		 * this may finally trigger drbd_ldev_destroy. */
		put_ldev(mdev);
	}

	/* Disks got bigger while they were detached */
	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
		if (ns.conn == C_CONNECTED)
			resync_after_online_grow(mdev);
	}

	/* A resync finished or aborted, wake paused devices... */
	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
	    (os.peer_isp && !ns.peer_isp) ||
	    (os.user_isp && !ns.user_isp))
		resume_next_sg(mdev);

	/* sync target done with resync. Explicitly notify peer, even though
	 * it should (at least for non-empty resyncs) already know itself. */
	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
		drbd_send_state(mdev);

	/* free tl_hash if we got thawed and are C_STANDALONE */
	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
		drbd_free_tl_hash(mdev);

	/* Upon network connection, we need to start the receiver */
	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
		drbd_thread_start(&mdev->receiver);

	/* Terminate worker thread if we are unconfigured - it will be
	   restarted as needed... */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY) {
		if (os.aftr_isp != ns.aftr_isp)
			resume_next_sg(mdev);
		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
		if (test_bit(DEVICE_DYING, &mdev->flags))
			drbd_thread_stop_nowait(&mdev->worker);
	}

	drbd_md_sync(mdev);
}


1535static int drbd_thread_setup(void *arg)
1536{
1537 struct drbd_thread *thi = (struct drbd_thread *) arg;
1538 struct drbd_conf *mdev = thi->mdev;
1539 unsigned long flags;
1540 int retval;
1541
1542restart:
1543 retval = thi->function(thi);
1544
1545 spin_lock_irqsave(&thi->t_lock, flags);
1546
1547 /* if the receiver has been "Exiting", the last thing it did
1548 * was set the conn state to "StandAlone",
1549 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1550 * and receiver thread will be "started".
1551 * drbd_thread_start needs to set "Restarting" in that case.
1552 * t_state check and assignment needs to be within the same spinlock,
1553 * so either thread_start sees Exiting, and can remap to Restarting,
1554	 * or thread_start sees None, and can proceed as normal.
1555 */
1556
1557 if (thi->t_state == Restarting) {
1558 dev_info(DEV, "Restarting %s\n", current->comm);
1559 thi->t_state = Running;
1560 spin_unlock_irqrestore(&thi->t_lock, flags);
1561 goto restart;
1562 }
1563
1564 thi->task = NULL;
1565 thi->t_state = None;
1566 smp_mb();
1567 complete(&thi->stop);
1568 spin_unlock_irqrestore(&thi->t_lock, flags);
1569
1570 dev_info(DEV, "Terminating %s\n", current->comm);
1571
1572 /* Release mod reference taken when thread was started */
1573 module_put(THIS_MODULE);
1574 return retval;
1575}
1576
1577static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1578 int (*func) (struct drbd_thread *))
1579{
1580 spin_lock_init(&thi->t_lock);
1581 thi->task = NULL;
1582 thi->t_state = None;
1583 thi->function = func;
1584 thi->mdev = mdev;
1585}
1586
1587int drbd_thread_start(struct drbd_thread *thi)
1588{
1589 struct drbd_conf *mdev = thi->mdev;
1590 struct task_struct *nt;
1591 unsigned long flags;
1592
1593 const char *me =
1594 thi == &mdev->receiver ? "receiver" :
1595 thi == &mdev->asender ? "asender" :
1596 thi == &mdev->worker ? "worker" : "NONSENSE";
1597
1598 /* is used from state engine doing drbd_thread_stop_nowait,
1599 * while holding the req lock irqsave */
1600 spin_lock_irqsave(&thi->t_lock, flags);
1601
1602 switch (thi->t_state) {
1603 case None:
1604 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1605 me, current->comm, current->pid);
1606
1607 /* Get ref on module for thread - this is released when thread exits */
1608 if (!try_module_get(THIS_MODULE)) {
1609 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1610 spin_unlock_irqrestore(&thi->t_lock, flags);
1611 return FALSE;
1612 }
1613
1614 init_completion(&thi->stop);
1615 D_ASSERT(thi->task == NULL);
1616 thi->reset_cpu_mask = 1;
1617 thi->t_state = Running;
1618 spin_unlock_irqrestore(&thi->t_lock, flags);
1619 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1620
1621 nt = kthread_create(drbd_thread_setup, (void *) thi,
1622 "drbd%d_%s", mdev_to_minor(mdev), me);
1623
1624 if (IS_ERR(nt)) {
1625 dev_err(DEV, "Couldn't start thread\n");
1626
1627 module_put(THIS_MODULE);
1628 return FALSE;
1629 }
1630 spin_lock_irqsave(&thi->t_lock, flags);
1631 thi->task = nt;
1632 thi->t_state = Running;
1633 spin_unlock_irqrestore(&thi->t_lock, flags);
1634 wake_up_process(nt);
1635 break;
1636 case Exiting:
1637 thi->t_state = Restarting;
1638 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1639 me, current->comm, current->pid);
1640 /* fall through */
1641 case Running:
1642 case Restarting:
1643 default:
1644 spin_unlock_irqrestore(&thi->t_lock, flags);
1645 break;
1646 }
1647
1648 return TRUE;
1649}
1650
1651
1652void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1653{
1654 unsigned long flags;
1655
1656 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1657
1658 /* may be called from state engine, holding the req lock irqsave */
1659 spin_lock_irqsave(&thi->t_lock, flags);
1660
1661 if (thi->t_state == None) {
1662 spin_unlock_irqrestore(&thi->t_lock, flags);
1663 if (restart)
1664 drbd_thread_start(thi);
1665 return;
1666 }
1667
1668 if (thi->t_state != ns) {
1669 if (thi->task == NULL) {
1670 spin_unlock_irqrestore(&thi->t_lock, flags);
1671 return;
1672 }
1673
1674 thi->t_state = ns;
1675 smp_mb();
1676 init_completion(&thi->stop);
1677 if (thi->task != current)
1678 force_sig(DRBD_SIGKILL, thi->task);
1679
1680 }
1681
1682 spin_unlock_irqrestore(&thi->t_lock, flags);
1683
1684 if (wait)
1685 wait_for_completion(&thi->stop);
1686}
1687
1688#ifdef CONFIG_SMP
1689/**
1690 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1691 * @mdev: DRBD device.
1692 *
1693 * Forces all threads of a device onto the same CPU. This is beneficial for
1694 * DRBD's performance. May be overridden by the user's configuration.
1695 */
1696void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1697{
1698 int ord, cpu;
1699
1700 /* user override. */
1701 if (cpumask_weight(mdev->cpu_mask))
1702 return;
1703
1704 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1705 for_each_online_cpu(cpu) {
1706 if (ord-- == 0) {
1707 cpumask_set_cpu(cpu, mdev->cpu_mask);
1708 return;
1709 }
1710 }
1711 /* should not be reached */
1712 cpumask_setall(mdev->cpu_mask);
1713}
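/* Worked example for drbd_calc_cpu_mask() (illustrative numbers): with four
 * CPUs online, minor 0 and minor 4 land on the first online CPU, minor 5 on
 * the second, and so on (minor modulo the number of online CPUs).  A
 * non-empty, user-configured cpu_mask is left untouched. */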
1714
1715/**
1716 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1717 * @mdev: DRBD device.
1718 *
1719 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1720 * prematurely.
1721 */
1722void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1723{
1724 struct task_struct *p = current;
1725 struct drbd_thread *thi =
1726 p == mdev->asender.task ? &mdev->asender :
1727 p == mdev->receiver.task ? &mdev->receiver :
1728 p == mdev->worker.task ? &mdev->worker :
1729 NULL;
1730 ERR_IF(thi == NULL)
1731 return;
1732 if (!thi->reset_cpu_mask)
1733 return;
1734 thi->reset_cpu_mask = 0;
1735 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1736}
1737#endif
1738
1739/* the appropriate socket mutex must be held already */
1740int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001741 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001742 size_t size, unsigned msg_flags)
1743{
1744 int sent, ok;
1745
1746 ERR_IF(!h) return FALSE;
1747 ERR_IF(!size) return FALSE;
1748
1749 h->magic = BE_DRBD_MAGIC;
1750 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001751 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001752
Philipp Reisnerb411b362009-09-25 16:07:19 -07001753 sent = drbd_send(mdev, sock, h, size, msg_flags);
1754
1755 ok = (sent == size);
1756 if (!ok)
1757 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1758 cmdname(cmd), (int)size, sent);
1759 return ok;
1760}
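/* Illustrative call pattern (a sketch, not taken verbatim from this driver;
 * P_SOMETHING and struct p_something are placeholders): with the appropriate
 * socket mutex held, a fixed-size packet with an embedded header80 is sent as
 *
 *	struct p_something p;
 *	...fill the payload fields of p...
 *	_drbd_send_cmd(mdev, mdev->data.socket, P_SOMETHING,
 *		       (struct p_header80 *)&p, sizeof(p), 0);
 *
 * _drbd_send_cmd() then fills in magic/command and sets h->length to
 * sizeof(p) - sizeof(struct p_header80), i.e. the payload size only. */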
1761
1762/* don't pass the socket. we may only look at it
1763 * when we hold the appropriate socket mutex.
1764 */
1765int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001766 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001767{
1768 int ok = 0;
1769 struct socket *sock;
1770
1771 if (use_data_socket) {
1772 mutex_lock(&mdev->data.mutex);
1773 sock = mdev->data.socket;
1774 } else {
1775 mutex_lock(&mdev->meta.mutex);
1776 sock = mdev->meta.socket;
1777 }
1778
1779 /* drbd_disconnect() could have called drbd_free_sock()
1780 * while we were waiting in down()... */
1781 if (likely(sock != NULL))
1782 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1783
1784 if (use_data_socket)
1785 mutex_unlock(&mdev->data.mutex);
1786 else
1787 mutex_unlock(&mdev->meta.mutex);
1788 return ok;
1789}
1790
1791int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1792 size_t size)
1793{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001794 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001795 int ok;
1796
1797 h.magic = BE_DRBD_MAGIC;
1798 h.command = cpu_to_be16(cmd);
1799 h.length = cpu_to_be16(size);
1800
1801 if (!drbd_get_data_sock(mdev))
1802 return 0;
1803
Philipp Reisnerb411b362009-09-25 16:07:19 -07001804 ok = (sizeof(h) ==
1805 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1806 ok = ok && (size ==
1807 drbd_send(mdev, mdev->data.socket, data, size, 0));
1808
1809 drbd_put_data_sock(mdev);
1810
1811 return ok;
1812}
1813
1814int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1815{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001816 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001817 struct socket *sock;
1818 int size, rv;
1819 const int apv = mdev->agreed_pro_version;
1820
1821 size = apv <= 87 ? sizeof(struct p_rs_param)
1822 : apv == 88 ? sizeof(struct p_rs_param)
1823 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001824 : apv <= 94 ? sizeof(struct p_rs_param_89)
1825 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001826
1827 /* used from admin command context and receiver/worker context.
1828 * to avoid kmalloc, grab the socket right here,
1829 * then use the pre-allocated sbuf there */
1830 mutex_lock(&mdev->data.mutex);
1831 sock = mdev->data.socket;
1832
1833 if (likely(sock != NULL)) {
1834 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1835
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001836 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001837
1838 /* initialize verify_alg and csums_alg */
1839 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1840
1841 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001842 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1843 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1844 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1845 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001846
1847 if (apv >= 88)
1848 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1849 if (apv >= 89)
1850 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1851
1852 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1853 } else
1854 rv = 0; /* not ok */
1855
1856 mutex_unlock(&mdev->data.mutex);
1857
1858 return rv;
1859}
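/* Packet size above grows with the agreed protocol version: up to 87 only the
 * fixed p_rs_param fields go out, 88 appends the verify_alg string, 89..94 use
 * p_rs_param_89 (which adds csums_alg), and 95 or newer use p_rs_param_95 with
 * the resync controller knobs (c_plan_ahead, c_delay_target, c_fill_target,
 * c_max_rate).  Peers at 89+ get P_SYNC_PARAM89, older ones P_SYNC_PARAM. */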
1860
1861int drbd_send_protocol(struct drbd_conf *mdev)
1862{
1863 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001864 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001865
1866 size = sizeof(struct p_protocol);
1867
1868 if (mdev->agreed_pro_version >= 87)
1869 size += strlen(mdev->net_conf->integrity_alg) + 1;
1870
1871 /* we must not recurse into our own queue,
1872 * as that is blocked during handshake */
1873 p = kmalloc(size, GFP_NOIO);
1874 if (p == NULL)
1875 return 0;
1876
1877 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1878 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1879 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1880 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001881 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1882
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001883 cf = 0;
1884 if (mdev->net_conf->want_lose)
1885 cf |= CF_WANT_LOSE;
1886 if (mdev->net_conf->dry_run) {
1887 if (mdev->agreed_pro_version >= 92)
1888 cf |= CF_DRY_RUN;
1889 else {
1890 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001891 kfree(p);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001892 return 0;
1893 }
1894 }
1895 p->conn_flags = cpu_to_be32(cf);
1896
Philipp Reisnerb411b362009-09-25 16:07:19 -07001897 if (mdev->agreed_pro_version >= 87)
1898 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1899
1900 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001901 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001902 kfree(p);
1903 return rv;
1904}
1905
1906int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1907{
1908 struct p_uuids p;
1909 int i;
1910
1911 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1912 return 1;
1913
1914 for (i = UI_CURRENT; i < UI_SIZE; i++)
1915 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1916
1917 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1918 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1919 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1920 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1921 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1922 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1923
1924 put_ldev(mdev);
1925
1926 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001927 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001928}
1929
1930int drbd_send_uuids(struct drbd_conf *mdev)
1931{
1932 return _drbd_send_uuids(mdev, 0);
1933}
1934
1935int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1936{
1937 return _drbd_send_uuids(mdev, 8);
1938}
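/* uuid_flags bits assembled in _drbd_send_uuids() above: 1 mirrors
 * net_conf->want_lose, 2 is set for a crashed primary, 4 if the new disk
 * state is still D_INCONSISTENT.  The value 8 passed in here presumably asks
 * the peer to skip the initial sync, judging by this function's name. */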
1939
1940
1941int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1942{
1943 struct p_rs_uuid p;
1944
1945 p.uuid = cpu_to_be64(val);
1946
1947 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001948 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001949}
1950
Philipp Reisnere89b5912010-03-24 17:11:33 +01001951int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001952{
1953 struct p_sizes p;
1954 sector_t d_size, u_size;
1955 int q_order_type;
1956 int ok;
1957
1958 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1959 D_ASSERT(mdev->ldev->backing_bdev);
1960 d_size = drbd_get_max_capacity(mdev->ldev);
1961 u_size = mdev->ldev->dc.disk_size;
1962 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001963 put_ldev(mdev);
1964 } else {
1965 d_size = 0;
1966 u_size = 0;
1967 q_order_type = QUEUE_ORDERED_NONE;
1968 }
1969
1970 p.d_size = cpu_to_be64(d_size);
1971 p.u_size = cpu_to_be64(u_size);
1972 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01001973 p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
Philipp Reisnere89b5912010-03-24 17:11:33 +01001974 p.queue_order_type = cpu_to_be16(q_order_type);
1975 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001976
1977 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001978 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001979 return ok;
1980}
1981
1982/**
1983 * drbd_send_state() - Sends the drbd state to the peer
1984 * @mdev: DRBD device.
1985 */
1986int drbd_send_state(struct drbd_conf *mdev)
1987{
1988 struct socket *sock;
1989 struct p_state p;
1990 int ok = 0;
1991
1992	/* Grab state lock so we won't send state if we're in the middle
1993	 * of a cluster-wide state change on another thread */
1994 drbd_state_lock(mdev);
1995
1996 mutex_lock(&mdev->data.mutex);
1997
1998 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1999 sock = mdev->data.socket;
2000
2001 if (likely(sock != NULL)) {
2002 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002003 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002004 }
2005
2006 mutex_unlock(&mdev->data.mutex);
2007
2008 drbd_state_unlock(mdev);
2009 return ok;
2010}
2011
2012int drbd_send_state_req(struct drbd_conf *mdev,
2013 union drbd_state mask, union drbd_state val)
2014{
2015 struct p_req_state p;
2016
2017 p.mask = cpu_to_be32(mask.i);
2018 p.val = cpu_to_be32(val.i);
2019
2020 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002021 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002022}
2023
2024int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
2025{
2026 struct p_req_state_reply p;
2027
2028 p.retcode = cpu_to_be32(retcode);
2029
2030 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002031 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002032}
2033
2034int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2035 struct p_compressed_bm *p,
2036 struct bm_xfer_ctx *c)
2037{
2038 struct bitstream bs;
2039 unsigned long plain_bits;
2040 unsigned long tmp;
2041 unsigned long rl;
2042 unsigned len;
2043 unsigned toggle;
2044 int bits;
2045
2046 /* may we use this feature? */
2047 if ((mdev->sync_conf.use_rle == 0) ||
2048 (mdev->agreed_pro_version < 90))
2049 return 0;
2050
2051 if (c->bit_offset >= c->bm_bits)
2052 return 0; /* nothing to do. */
2053
2054	/* use at most this many bytes */
2055 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2056 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2057 /* plain bits covered in this code string */
2058 plain_bits = 0;
2059
2060	/* p->encoding & 0x80 stores whether the first run is of set bits.
2061	 * bit offset is implicit.
2062	 * start with toggle == 2 to be able to tell apart the first iteration */
2063 toggle = 2;
2064
2065	/* see how many plain bits we can stuff into one packet
2066 * using RLE and VLI. */
2067 do {
2068 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2069 : _drbd_bm_find_next(mdev, c->bit_offset);
2070 if (tmp == -1UL)
2071 tmp = c->bm_bits;
2072 rl = tmp - c->bit_offset;
2073
2074 if (toggle == 2) { /* first iteration */
2075 if (rl == 0) {
2076 /* the first checked bit was set,
2077 * store start value, */
2078 DCBP_set_start(p, 1);
2079 /* but skip encoding of zero run length */
2080 toggle = !toggle;
2081 continue;
2082 }
2083 DCBP_set_start(p, 0);
2084 }
2085
2086 /* paranoia: catch zero runlength.
2087 * can only happen if bitmap is modified while we scan it. */
2088 if (rl == 0) {
2089 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2090 "t:%u bo:%lu\n", toggle, c->bit_offset);
2091 return -1;
2092 }
2093
2094 bits = vli_encode_bits(&bs, rl);
2095 if (bits == -ENOBUFS) /* buffer full */
2096 break;
2097 if (bits <= 0) {
2098 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2099 return 0;
2100 }
2101
2102 toggle = !toggle;
2103 plain_bits += rl;
2104 c->bit_offset = tmp;
2105 } while (c->bit_offset < c->bm_bits);
2106
2107 len = bs.cur.b - p->code + !!bs.cur.bit;
2108
2109 if (plain_bits < (len << 3)) {
2110 /* incompressible with this method.
2111 * we need to rewind both word and bit position. */
2112 c->bit_offset -= plain_bits;
2113 bm_xfer_ctx_bit_to_word_offset(c);
2114 c->bit_offset = c->word_offset * BITS_PER_LONG;
2115 return 0;
2116 }
2117
2118 /* RLE + VLI was able to compress it just fine.
2119 * update c->word_offset. */
2120 bm_xfer_ctx_bit_to_word_offset(c);
2121
2122 /* store pad_bits */
2123 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2124
2125 return len;
2126}
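/* Encoding sketch (illustrative numbers): a bitmap region starting with
 * 5 clear bits, then 8 set bits, then 100 clear bits is emitted as the run
 * lengths 5, 8, 100, VLI-encoded into p->code; the "start" bit in p->encoding
 * records that the first run consists of clear bits.  Had the region started
 * with a set bit, the start bit would be 1 and the leading zero-length run is
 * simply skipped.  If the resulting code needs more bits than the plain bits
 * it covers, the function rewinds and returns 0 so the caller falls back to
 * sending plain bitmap words. */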
2127
2128enum { OK, FAILED, DONE }
2129send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002130 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002131{
2132 struct p_compressed_bm *p = (void*)h;
2133 unsigned long num_words;
2134 int len;
2135 int ok;
2136
2137 len = fill_bitmap_rle_bits(mdev, p, c);
2138
2139 if (len < 0)
2140 return FAILED;
2141
2142 if (len) {
2143 DCBP_set_code(p, RLE_VLI_Bits);
2144 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2145 sizeof(*p) + len, 0);
2146
2147 c->packets[0]++;
2148 c->bytes[0] += sizeof(*p) + len;
2149
2150 if (c->bit_offset >= c->bm_bits)
2151 len = 0; /* DONE */
2152 } else {
2153 /* was not compressible.
2154 * send a buffer full of plain text bits instead. */
2155 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2156 len = num_words * sizeof(long);
2157 if (len)
2158 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2159 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002160 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002161 c->word_offset += num_words;
2162 c->bit_offset = c->word_offset * BITS_PER_LONG;
2163
2164 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002165 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002166
2167 if (c->bit_offset > c->bm_bits)
2168 c->bit_offset = c->bm_bits;
2169 }
2170 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2171
2172 if (ok == DONE)
2173 INFO_bm_xfer_stats(mdev, "send", c);
2174 return ok;
2175}
2176
2177/* See the comment at receive_bitmap() */
2178int _drbd_send_bitmap(struct drbd_conf *mdev)
2179{
2180 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002181 struct p_header80 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002182 int ret;
2183
2184 ERR_IF(!mdev->bitmap) return FALSE;
2185
2186 /* maybe we should use some per thread scratch page,
2187 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002188 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002189 if (!p) {
2190 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2191 return FALSE;
2192 }
2193
2194 if (get_ldev(mdev)) {
2195 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2196 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2197 drbd_bm_set_all(mdev);
2198 if (drbd_bm_write(mdev)) {
2199 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2200 * but otherwise process as per normal - need to tell other
2201 * side that a full resync is required! */
2202 dev_err(DEV, "Failed to write bitmap to disk!\n");
2203 } else {
2204 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2205 drbd_md_sync(mdev);
2206 }
2207 }
2208 put_ldev(mdev);
2209 }
2210
2211 c = (struct bm_xfer_ctx) {
2212 .bm_bits = drbd_bm_bits(mdev),
2213 .bm_words = drbd_bm_words(mdev),
2214 };
2215
2216 do {
2217 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2218 } while (ret == OK);
2219
2220 free_page((unsigned long) p);
2221 return (ret == DONE);
2222}
2223
2224int drbd_send_bitmap(struct drbd_conf *mdev)
2225{
2226 int err;
2227
2228 if (!drbd_get_data_sock(mdev))
2229 return -1;
2230 err = !_drbd_send_bitmap(mdev);
2231 drbd_put_data_sock(mdev);
2232 return err;
2233}
2234
2235int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2236{
2237 int ok;
2238 struct p_barrier_ack p;
2239
2240 p.barrier = barrier_nr;
2241 p.set_size = cpu_to_be32(set_size);
2242
2243 if (mdev->state.conn < C_CONNECTED)
2244 return FALSE;
2245 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002246 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002247 return ok;
2248}
2249
2250/**
2251 * _drbd_send_ack() - Sends an ack packet
2252 * @mdev: DRBD device.
2253 * @cmd: Packet command code.
2254 * @sector: sector, needs to be in big endian byte order
2255 * @blksize: size in byte, needs to be in big endian byte order
2256 * @block_id: Id, big endian byte order
2257 */
2258static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2259 u64 sector,
2260 u32 blksize,
2261 u64 block_id)
2262{
2263 int ok;
2264 struct p_block_ack p;
2265
2266 p.sector = sector;
2267 p.block_id = block_id;
2268 p.blksize = blksize;
2269 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2270
2271 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2272 return FALSE;
2273 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002274 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002275 return ok;
2276}
2277
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002278/* dp->sector and dp->block_id already/still in network byte order,
2279 * data_size is payload size according to dp->head,
2280 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002281int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002282 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002283{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002284 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2285 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002286 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2287 dp->block_id);
2288}
2289
2290int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2291 struct p_block_req *rp)
2292{
2293 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2294}
2295
2296/**
2297 * drbd_send_ack() - Sends an ack packet
2298 * @mdev: DRBD device.
2299 * @cmd: Packet command code.
2300 * @e: Epoch entry.
2301 */
2302int drbd_send_ack(struct drbd_conf *mdev,
2303 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2304{
2305 return _drbd_send_ack(mdev, cmd,
2306 cpu_to_be64(e->sector),
2307 cpu_to_be32(e->size),
2308 e->block_id);
2309}
2310
2311/* This function misuses the block_id field to signal if the blocks
2312	 * are in sync or not. */
2313int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2314 sector_t sector, int blksize, u64 block_id)
2315{
2316 return _drbd_send_ack(mdev, cmd,
2317 cpu_to_be64(sector),
2318 cpu_to_be32(blksize),
2319 cpu_to_be64(block_id));
2320}
2321
2322int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2323 sector_t sector, int size, u64 block_id)
2324{
2325 int ok;
2326 struct p_block_req p;
2327
2328 p.sector = cpu_to_be64(sector);
2329 p.block_id = block_id;
2330 p.blksize = cpu_to_be32(size);
2331
2332 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002333 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002334 return ok;
2335}
2336
2337int drbd_send_drequest_csum(struct drbd_conf *mdev,
2338 sector_t sector, int size,
2339 void *digest, int digest_size,
2340 enum drbd_packets cmd)
2341{
2342 int ok;
2343 struct p_block_req p;
2344
2345 p.sector = cpu_to_be64(sector);
2346 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2347 p.blksize = cpu_to_be32(size);
2348
2349 p.head.magic = BE_DRBD_MAGIC;
2350 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002351 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002352
2353 mutex_lock(&mdev->data.mutex);
2354
2355 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2356 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2357
2358 mutex_unlock(&mdev->data.mutex);
2359
2360 return ok;
2361}
2362
2363int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2364{
2365 int ok;
2366 struct p_block_req p;
2367
2368 p.sector = cpu_to_be64(sector);
2369 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2370 p.blksize = cpu_to_be32(size);
2371
2372 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002373 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002374 return ok;
2375}
2376
2377/* called on sndtimeo
2378 * returns FALSE if we should retry,
2379 * TRUE if we think connection is dead
2380 */
2381static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2382{
2383 int drop_it;
2384 /* long elapsed = (long)(jiffies - mdev->last_received); */
2385
2386 drop_it = mdev->meta.socket == sock
2387 || !mdev->asender.task
2388 || get_t_state(&mdev->asender) != Running
2389 || mdev->state.conn < C_CONNECTED;
2390
2391 if (drop_it)
2392 return TRUE;
2393
2394 drop_it = !--mdev->ko_count;
2395 if (!drop_it) {
2396 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2397 current->comm, current->pid, mdev->ko_count);
2398 request_ping(mdev);
2399 }
2400
2401 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2402}
2403
2404/* The idea of sendpage seems to be to put some kind of reference
2405 * to the page into the skb, and to hand it over to the NIC. In
2406 * this process get_page() gets called.
2407 *
2408 * As soon as the page was really sent over the network put_page()
2409 * gets called by some part of the network layer. [ NIC driver? ]
2410 *
2411 * [ get_page() / put_page() increment/decrement the count. If count
2412 * reaches 0 the page will be freed. ]
2413 *
2414 * This works nicely with pages from FSs.
2415 * But this means that in protocol A we might signal IO completion too early!
2416 *
2417 * In order not to corrupt data during a resync we must make sure
2418	 * that we do not reuse our own buffer pages (EEs) too early, therefore
2419 * we have the net_ee list.
2420 *
2421 * XFS seems to have problems, still, it submits pages with page_count == 0!
2422 * As a workaround, we disable sendpage on pages
2423 * with page_count == 0 or PageSlab.
2424 */
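/* In short (as implemented below): pages that cannot safely be handed to
 * sendpage -- page_count() == 0, PageSlab(), or the disable_sendpage knob --
 * are copied into the socket via _drbd_no_send_page(); everything else goes
 * zero-copy through the socket's ->sendpage(). */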
2425static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002426 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002427{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002428 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002429 kunmap(page);
2430 if (sent == size)
2431 mdev->send_cnt += size>>9;
2432 return sent == size;
2433}
2434
2435static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002436 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002437{
2438 mm_segment_t oldfs = get_fs();
2439 int sent, ok;
2440 int len = size;
2441
2442 /* e.g. XFS meta- & log-data is in slab pages, which have a
2443 * page_count of 0 and/or have PageSlab() set.
2444 * we cannot use send_page for those, as that does get_page();
2445 * put_page(); and would cause either a VM_BUG directly, or
2446 * __page_cache_release a page that would actually still be referenced
2447 * by someone, leading to some obscure delayed Oops somewhere else. */
2448 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002449 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002450
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002451 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002452 drbd_update_congested(mdev);
2453 set_fs(KERNEL_DS);
2454 do {
2455 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2456 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002457 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002458 if (sent == -EAGAIN) {
2459 if (we_should_drop_the_connection(mdev,
2460 mdev->data.socket))
2461 break;
2462 else
2463 continue;
2464 }
2465 if (sent <= 0) {
2466 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2467 __func__, (int)size, len, sent);
2468 break;
2469 }
2470 len -= sent;
2471 offset += sent;
2472 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2473 set_fs(oldfs);
2474 clear_bit(NET_CONGESTED, &mdev->flags);
2475
2476 ok = (len == 0);
2477 if (likely(ok))
2478 mdev->send_cnt += size>>9;
2479 return ok;
2480}
2481
2482static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2483{
2484 struct bio_vec *bvec;
2485 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002486 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002487 __bio_for_each_segment(bvec, bio, i, 0) {
2488 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002489 bvec->bv_offset, bvec->bv_len,
2490 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002491 return 0;
2492 }
2493 return 1;
2494}
2495
2496static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2497{
2498 struct bio_vec *bvec;
2499 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002500 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002501 __bio_for_each_segment(bvec, bio, i, 0) {
2502 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002503 bvec->bv_offset, bvec->bv_len,
2504 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002505 return 0;
2506 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002507 return 1;
2508}
2509
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002510static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2511{
2512 struct page *page = e->pages;
2513 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002514 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002515 page_chain_for_each(page) {
2516 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002517 if (!_drbd_send_page(mdev, page, 0, l,
2518 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002519 return 0;
2520 len -= l;
2521 }
2522 return 1;
2523}
2524
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002525static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2526{
2527 if (mdev->agreed_pro_version >= 95)
2528 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002529 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2530 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2531 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2532 else
Jens Axboe721a9602011-03-09 11:56:30 +01002533 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002534}
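/* Mapping note: for peers speaking protocol 95 or newer the full set of
 * request hints travels on the wire (DP_RW_SYNC, DP_FUA, DP_FLUSH,
 * DP_DISCARD); older peers only get the sync hint, presumably because they
 * do not know the other flags. */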
2535
Philipp Reisnerb411b362009-09-25 16:07:19 -07002536/* Used to send write requests
2537 * R_PRIMARY -> Peer (P_DATA)
2538 */
2539int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2540{
2541 int ok = 1;
2542 struct p_data p;
2543 unsigned int dp_flags = 0;
2544 void *dgb;
2545 int dgs;
2546
2547 if (!drbd_get_data_sock(mdev))
2548 return 0;
2549
2550 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2551 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2552
Philipp Reisnerd5373382010-08-23 15:18:33 +02002553 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002554 p.head.h80.magic = BE_DRBD_MAGIC;
2555 p.head.h80.command = cpu_to_be16(P_DATA);
2556 p.head.h80.length =
2557 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2558 } else {
2559 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2560 p.head.h95.command = cpu_to_be16(P_DATA);
2561 p.head.h95.length =
2562 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2563 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002564
2565 p.sector = cpu_to_be64(req->sector);
2566 p.block_id = (unsigned long)req;
2567 p.seq_num = cpu_to_be32(req->seq_num =
2568 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002569
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002570 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2571
Philipp Reisnerb411b362009-09-25 16:07:19 -07002572 if (mdev->state.conn >= C_SYNC_SOURCE &&
2573 mdev->state.conn <= C_PAUSED_SYNC_T)
2574 dp_flags |= DP_MAY_SET_IN_SYNC;
2575
2576 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002577 set_bit(UNPLUG_REMOTE, &mdev->flags);
2578 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002579 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002580 if (ok && dgs) {
2581 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002582 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002583 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002584 }
2585 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002586 /* For protocol A, we have to memcpy the payload into
2587 * socket buffers, as we may complete right away
2588 * as soon as we handed it over to tcp, at which point the data
2589 * pages may become invalid.
2590 *
2591 * For data-integrity enabled, we copy it as well, so we can be
2592 * sure that even if the bio pages may still be modified, it
2593 * won't change the data on the wire, thus if the digest checks
2594 * out ok after sending on this side, but does not fit on the
2595 * receiving side, we sure have detected corruption elsewhere.
2596 */
2597 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002598 ok = _drbd_send_bio(mdev, req->master_bio);
2599 else
2600 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002601
2602 /* double check digest, sometimes buffers have been modified in flight. */
2603 if (dgs > 0 && dgs <= 64) {
2604			/* 64 bytes, 512 bits, is the largest digest size
2605 * currently supported in kernel crypto. */
2606 unsigned char digest[64];
2607 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2608 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2609 dev_warn(DEV,
2610 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2611 (unsigned long long)req->sector, req->size);
2612 }
2613 } /* else if (dgs > 64) {
2614 ... Be noisy about digest too large ...
2615 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002616 }
2617
2618 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc2010-05-04 12:33:58 +02002619
Philipp Reisnerb411b362009-09-25 16:07:19 -07002620 return ok;
2621}
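/* Wire layout produced above: the p_data packet (header plus sector,
 * block_id, seq_num and dp_flags) uses a header80 with 16 bit length for
 * requests that fit DRBD_MAX_SIZE_H80_PACKET, otherwise a header95 with
 * 32 bit length and the "big" magic; it is followed by an optional integrity
 * digest and the payload itself.  For protocol A, or whenever a digest is in
 * use, the bio payload is copied into the socket; otherwise it is sent
 * zero-copy, and the second digest pass afterwards only warns if upper layers
 * modified the pages while they were in flight. */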
2622
2623/* answer packet, used to send data back for read requests:
2624 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2625 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2626 */
2627int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2628 struct drbd_epoch_entry *e)
2629{
2630 int ok;
2631 struct p_data p;
2632 void *dgb;
2633 int dgs;
2634
2635 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2636 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2637
Philipp Reisnerd5373382010-08-23 15:18:33 +02002638 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002639 p.head.h80.magic = BE_DRBD_MAGIC;
2640 p.head.h80.command = cpu_to_be16(cmd);
2641 p.head.h80.length =
2642 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2643 } else {
2644 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2645 p.head.h95.command = cpu_to_be16(cmd);
2646 p.head.h95.length =
2647 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2648 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002649
2650 p.sector = cpu_to_be64(e->sector);
2651 p.block_id = e->block_id;
2652 /* p.seq_num = 0; No sequence numbers here.. */
2653
2654 /* Only called by our kernel thread.
2655 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2656 * in response to admin command or module unload.
2657 */
2658 if (!drbd_get_data_sock(mdev))
2659 return 0;
2660
Philipp Reisner0b70a132010-08-20 13:36:10 +02002661 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002662 if (ok && dgs) {
2663 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002664 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002665 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002666 }
2667 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002668 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002669
2670 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc2010-05-04 12:33:58 +02002671
Philipp Reisnerb411b362009-09-25 16:07:19 -07002672 return ok;
2673}
2674
Philipp Reisner73a01a12010-10-27 14:33:00 +02002675int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2676{
2677 struct p_block_desc p;
2678
2679 p.sector = cpu_to_be64(req->sector);
2680 p.blksize = cpu_to_be32(req->size);
2681
2682 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2683}
2684
Philipp Reisnerb411b362009-09-25 16:07:19 -07002685/*
2686 drbd_send distinguishes two cases:
2687
2688 Packets sent via the data socket "sock"
2689 and packets sent via the meta data socket "msock"
2690
2691 sock msock
2692 -----------------+-------------------------+------------------------------
2693 timeout conf.timeout / 2 conf.timeout / 2
2694 timeout action send a ping via msock Abort communication
2695 and close all sockets
2696*/
2697
2698/*
2699 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2700 */
2701int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2702 void *buf, size_t size, unsigned msg_flags)
2703{
2704 struct kvec iov;
2705 struct msghdr msg;
2706 int rv, sent = 0;
2707
2708 if (!sock)
2709 return -1000;
2710
2711 /* THINK if (signal_pending) return ... ? */
2712
2713 iov.iov_base = buf;
2714 iov.iov_len = size;
2715
2716 msg.msg_name = NULL;
2717 msg.msg_namelen = 0;
2718 msg.msg_control = NULL;
2719 msg.msg_controllen = 0;
2720 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2721
2722 if (sock == mdev->data.socket) {
2723 mdev->ko_count = mdev->net_conf->ko_count;
2724 drbd_update_congested(mdev);
2725 }
2726 do {
2727 /* STRANGE
2728 * tcp_sendmsg does _not_ use its size parameter at all ?
2729 *
2730 * -EAGAIN on timeout, -EINTR on signal.
2731 */
2732/* THINK
2733 * do we need to block DRBD_SIG if sock == &meta.socket ??
2734 * otherwise wake_asender() might interrupt some send_*Ack !
2735 */
2736 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2737 if (rv == -EAGAIN) {
2738 if (we_should_drop_the_connection(mdev, sock))
2739 break;
2740 else
2741 continue;
2742 }
2743 D_ASSERT(rv != 0);
2744 if (rv == -EINTR) {
2745 flush_signals(current);
2746 rv = 0;
2747 }
2748 if (rv < 0)
2749 break;
2750 sent += rv;
2751 iov.iov_base += rv;
2752 iov.iov_len -= rv;
2753 } while (sent < size);
2754
2755 if (sock == mdev->data.socket)
2756 clear_bit(NET_CONGESTED, &mdev->flags);
2757
2758 if (rv <= 0) {
2759 if (rv != -EAGAIN) {
2760 dev_err(DEV, "%s_sendmsg returned %d\n",
2761 sock == mdev->meta.socket ? "msock" : "sock",
2762 rv);
2763 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2764 } else
2765 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2766 }
2767
2768 return sent;
2769}
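/* Semantics of drbd_send() above: returns the number of bytes handed to the
 * socket (-1000 if there is no socket).  -EINTR just flushes signals and
 * retries; -EAGAIN (send timeout) asks we_should_drop_the_connection() whether
 * to keep trying.  A final hard error forces the connection to C_BROKEN_PIPE,
 * a final timeout to C_TIMEOUT, from where the state machine cleans up. */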
2770
2771static int drbd_open(struct block_device *bdev, fmode_t mode)
2772{
2773 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2774 unsigned long flags;
2775 int rv = 0;
2776
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002777 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002778 spin_lock_irqsave(&mdev->req_lock, flags);
2779 /* to have a stable mdev->state.role
2780 * and no race with updating open_cnt */
2781
2782 if (mdev->state.role != R_PRIMARY) {
2783 if (mode & FMODE_WRITE)
2784 rv = -EROFS;
2785 else if (!allow_oos)
2786 rv = -EMEDIUMTYPE;
2787 }
2788
2789 if (!rv)
2790 mdev->open_cnt++;
2791 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002792 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002793
2794 return rv;
2795}
2796
2797static int drbd_release(struct gendisk *gd, fmode_t mode)
2798{
2799 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002800 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002801 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002802 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002803 return 0;
2804}
2805
Philipp Reisnerb411b362009-09-25 16:07:19 -07002806static void drbd_set_defaults(struct drbd_conf *mdev)
2807{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002808 /* This way we get a compile error when sync_conf grows,
2809	   and we forget to initialize it here */
2810 mdev->sync_conf = (struct syncer_conf) {
2811 /* .rate = */ DRBD_RATE_DEF,
2812 /* .after = */ DRBD_AFTER_DEF,
2813 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002814 /* .verify_alg = */ {}, 0,
2815 /* .cpu_mask = */ {}, 0,
2816 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002817 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002818 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2819 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2820 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2821 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002822 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2823 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002824 };
2825
2826	/* Have to do it this way, because the layout differs between
2827	   big-endian and little-endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002828 mdev->state = (union drbd_state) {
2829 { .role = R_SECONDARY,
2830 .peer = R_UNKNOWN,
2831 .conn = C_STANDALONE,
2832 .disk = D_DISKLESS,
2833 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002834 .susp = 0,
2835 .susp_nod = 0,
2836 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002837 } };
2838}
2839
2840void drbd_init_set_defaults(struct drbd_conf *mdev)
2841{
2842 /* the memset(,0,) did most of this.
2843 * note: only assignments, no allocation in here */
2844
2845 drbd_set_defaults(mdev);
2846
Philipp Reisnerb411b362009-09-25 16:07:19 -07002847 atomic_set(&mdev->ap_bio_cnt, 0);
2848 atomic_set(&mdev->ap_pending_cnt, 0);
2849 atomic_set(&mdev->rs_pending_cnt, 0);
2850 atomic_set(&mdev->unacked_cnt, 0);
2851 atomic_set(&mdev->local_cnt, 0);
2852 atomic_set(&mdev->net_cnt, 0);
2853 atomic_set(&mdev->packet_seq, 0);
2854 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02002855 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02002856 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002857 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02002858 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002859
2860 mutex_init(&mdev->md_io_mutex);
2861 mutex_init(&mdev->data.mutex);
2862 mutex_init(&mdev->meta.mutex);
2863 sema_init(&mdev->data.work.s, 0);
2864 sema_init(&mdev->meta.work.s, 0);
2865 mutex_init(&mdev->state_mutex);
2866
2867 spin_lock_init(&mdev->data.work.q_lock);
2868 spin_lock_init(&mdev->meta.work.q_lock);
2869
2870 spin_lock_init(&mdev->al_lock);
2871 spin_lock_init(&mdev->req_lock);
2872 spin_lock_init(&mdev->peer_seq_lock);
2873 spin_lock_init(&mdev->epoch_lock);
2874
2875 INIT_LIST_HEAD(&mdev->active_ee);
2876 INIT_LIST_HEAD(&mdev->sync_ee);
2877 INIT_LIST_HEAD(&mdev->done_ee);
2878 INIT_LIST_HEAD(&mdev->read_ee);
2879 INIT_LIST_HEAD(&mdev->net_ee);
2880 INIT_LIST_HEAD(&mdev->resync_reads);
2881 INIT_LIST_HEAD(&mdev->data.work.q);
2882 INIT_LIST_HEAD(&mdev->meta.work.q);
2883 INIT_LIST_HEAD(&mdev->resync_work.list);
2884 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002885 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002886 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02002887 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002888 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02002889
Philipp Reisnerb411b362009-09-25 16:07:19 -07002890 mdev->resync_work.cb = w_resync_inactive;
2891 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002892 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002893 mdev->md_sync_work.cb = w_md_sync;
2894 mdev->bm_io_work.w.cb = w_bitmap_io;
2895 init_timer(&mdev->resync_timer);
2896 init_timer(&mdev->md_sync_timer);
2897 mdev->resync_timer.function = resync_timer_fn;
2898 mdev->resync_timer.data = (unsigned long) mdev;
2899 mdev->md_sync_timer.function = md_sync_timer_fn;
2900 mdev->md_sync_timer.data = (unsigned long) mdev;
2901
2902 init_waitqueue_head(&mdev->misc_wait);
2903 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02002904 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002905 init_waitqueue_head(&mdev->ee_wait);
2906 init_waitqueue_head(&mdev->al_wait);
2907 init_waitqueue_head(&mdev->seq_wait);
2908
2909 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2910 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2911 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2912
2913 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02002914 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002915 mdev->resync_wenr = LC_FREE;
2916}
2917
2918void drbd_mdev_cleanup(struct drbd_conf *mdev)
2919{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002920 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002921 if (mdev->receiver.t_state != None)
2922 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2923 mdev->receiver.t_state);
2924
2925 /* no need to lock it, I'm the only thread alive */
2926 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2927 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2928 mdev->al_writ_cnt =
2929 mdev->bm_writ_cnt =
2930 mdev->read_cnt =
2931 mdev->recv_cnt =
2932 mdev->send_cnt =
2933 mdev->writ_cnt =
2934 mdev->p_size =
2935 mdev->rs_start =
2936 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002937 mdev->rs_failed = 0;
2938 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002939 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002940 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2941 mdev->rs_mark_left[i] = 0;
2942 mdev->rs_mark_time[i] = 0;
2943 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002944 D_ASSERT(mdev->net_conf == NULL);
2945
2946 drbd_set_my_capacity(mdev, 0);
2947 if (mdev->bitmap) {
2948 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01002949 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002950 drbd_bm_cleanup(mdev);
2951 }
2952
2953 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02002954 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002955
2956 /*
2957 * currently we drbd_init_ee only on module load, so
2958 * we may do drbd_release_ee only on module unload!
2959 */
2960 D_ASSERT(list_empty(&mdev->active_ee));
2961 D_ASSERT(list_empty(&mdev->sync_ee));
2962 D_ASSERT(list_empty(&mdev->done_ee));
2963 D_ASSERT(list_empty(&mdev->read_ee));
2964 D_ASSERT(list_empty(&mdev->net_ee));
2965 D_ASSERT(list_empty(&mdev->resync_reads));
2966 D_ASSERT(list_empty(&mdev->data.work.q));
2967 D_ASSERT(list_empty(&mdev->meta.work.q));
2968 D_ASSERT(list_empty(&mdev->resync_work.list));
2969 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002970 D_ASSERT(list_empty(&mdev->go_diskless.list));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002971}
2972
2973
2974static void drbd_destroy_mempools(void)
2975{
2976 struct page *page;
2977
2978 while (drbd_pp_pool) {
2979 page = drbd_pp_pool;
2980 drbd_pp_pool = (struct page *)page_private(page);
2981 __free_page(page);
2982 drbd_pp_vacant--;
2983 }
2984
2985 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2986
2987 if (drbd_ee_mempool)
2988 mempool_destroy(drbd_ee_mempool);
2989 if (drbd_request_mempool)
2990 mempool_destroy(drbd_request_mempool);
2991 if (drbd_ee_cache)
2992 kmem_cache_destroy(drbd_ee_cache);
2993 if (drbd_request_cache)
2994 kmem_cache_destroy(drbd_request_cache);
2995 if (drbd_bm_ext_cache)
2996 kmem_cache_destroy(drbd_bm_ext_cache);
2997 if (drbd_al_ext_cache)
2998 kmem_cache_destroy(drbd_al_ext_cache);
2999
3000 drbd_ee_mempool = NULL;
3001 drbd_request_mempool = NULL;
3002 drbd_ee_cache = NULL;
3003 drbd_request_cache = NULL;
3004 drbd_bm_ext_cache = NULL;
3005 drbd_al_ext_cache = NULL;
3006
3007 return;
3008}
3009
3010static int drbd_create_mempools(void)
3011{
3012 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003013 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003014 int i;
3015
3016 /* prepare our caches and mempools */
3017 drbd_request_mempool = NULL;
3018 drbd_ee_cache = NULL;
3019 drbd_request_cache = NULL;
3020 drbd_bm_ext_cache = NULL;
3021 drbd_al_ext_cache = NULL;
3022 drbd_pp_pool = NULL;
3023
3024 /* caches */
3025 drbd_request_cache = kmem_cache_create(
3026 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3027 if (drbd_request_cache == NULL)
3028 goto Enomem;
3029
3030 drbd_ee_cache = kmem_cache_create(
3031 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3032 if (drbd_ee_cache == NULL)
3033 goto Enomem;
3034
3035 drbd_bm_ext_cache = kmem_cache_create(
3036 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3037 if (drbd_bm_ext_cache == NULL)
3038 goto Enomem;
3039
3040 drbd_al_ext_cache = kmem_cache_create(
3041 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3042 if (drbd_al_ext_cache == NULL)
3043 goto Enomem;
3044
3045 /* mempools */
3046 drbd_request_mempool = mempool_create(number,
3047 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3048 if (drbd_request_mempool == NULL)
3049 goto Enomem;
3050
3051 drbd_ee_mempool = mempool_create(number,
3052 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003053 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003054 goto Enomem;
3055
3056 /* drbd's page pool */
3057 spin_lock_init(&drbd_pp_lock);
3058
3059 for (i = 0; i < number; i++) {
3060 page = alloc_page(GFP_HIGHUSER);
3061 if (!page)
3062 goto Enomem;
3063 set_page_private(page, (unsigned long)drbd_pp_pool);
3064 drbd_pp_pool = page;
3065 }
3066 drbd_pp_vacant = number;
3067
3068 return 0;
3069
3070Enomem:
3071 drbd_destroy_mempools(); /* in case we allocated some */
3072 return -ENOMEM;
3073}
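/* Sizing note: "number" above reserves (DRBD_MAX_BIO_SIZE / PAGE_SIZE) pages
 * per configured minor -- enough to back one maximally sized bio per device --
 * and is used both as the mempool minimum for requests and epoch entries and
 * as the count of pages pre-allocated into drbd's own page pool. */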
3074
3075static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3076 void *unused)
3077{
3078 /* just so we have it. you never know what interesting things we
3079 * might want to do here some day...
3080 */
3081
3082 return NOTIFY_DONE;
3083}
3084
3085static struct notifier_block drbd_notifier = {
3086 .notifier_call = drbd_notify_sys,
3087};
3088
3089static void drbd_release_ee_lists(struct drbd_conf *mdev)
3090{
3091 int rr;
3092
3093 rr = drbd_release_ee(mdev, &mdev->active_ee);
3094 if (rr)
3095 dev_err(DEV, "%d EEs in active list found!\n", rr);
3096
3097 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3098 if (rr)
3099 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3100
3101 rr = drbd_release_ee(mdev, &mdev->read_ee);
3102 if (rr)
3103 dev_err(DEV, "%d EEs in read list found!\n", rr);
3104
3105 rr = drbd_release_ee(mdev, &mdev->done_ee);
3106 if (rr)
3107 dev_err(DEV, "%d EEs in done list found!\n", rr);
3108
3109 rr = drbd_release_ee(mdev, &mdev->net_ee);
3110 if (rr)
3111 dev_err(DEV, "%d EEs in net list found!\n", rr);
3112}
3113
3114/* caution. no locking.
3115 * currently only used from module cleanup code. */
3116static void drbd_delete_device(unsigned int minor)
3117{
3118 struct drbd_conf *mdev = minor_to_mdev(minor);
3119
3120 if (!mdev)
3121 return;
3122
3123 /* paranoia asserts */
3124 if (mdev->open_cnt != 0)
3125 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3126 __FILE__ , __LINE__);
3127
3128 ERR_IF (!list_empty(&mdev->data.work.q)) {
3129 struct list_head *lp;
3130 list_for_each(lp, &mdev->data.work.q) {
3131 dev_err(DEV, "lp = %p\n", lp);
3132 }
3133 };
3134 /* end paranoia asserts */
3135
3136 del_gendisk(mdev->vdisk);
3137
3138 /* cleanup stuff that may have been allocated during
3139 * device (re-)configuration or state changes */
3140
3141 if (mdev->this_bdev)
3142 bdput(mdev->this_bdev);
3143
3144 drbd_free_resources(mdev);
3145
3146 drbd_release_ee_lists(mdev);
3147
3148 /* should be free'd on disconnect? */
3149 kfree(mdev->ee_hash);
3150 /*
3151 mdev->ee_hash_s = 0;
3152 mdev->ee_hash = NULL;
3153 */
3154
3155 lc_destroy(mdev->act_log);
3156 lc_destroy(mdev->resync);
3157
3158 kfree(mdev->p_uuid);
3159 /* mdev->p_uuid = NULL; */
3160
3161 kfree(mdev->int_dig_out);
3162 kfree(mdev->int_dig_in);
3163 kfree(mdev->int_dig_vv);
3164
3165 /* cleanup the rest that has been
3166 * allocated from drbd_new_device
3167 * and actually free the mdev itself */
3168 drbd_free_mdev(mdev);
3169}
3170
3171static void drbd_cleanup(void)
3172{
3173 unsigned int i;
3174
3175 unregister_reboot_notifier(&drbd_notifier);
3176
3177 drbd_nl_cleanup();
3178
3179 if (minor_table) {
3180 if (drbd_proc)
3181 remove_proc_entry("drbd", NULL);
3182 i = minor_count;
3183 while (i--)
3184 drbd_delete_device(i);
3185 drbd_destroy_mempools();
3186 }
3187
3188 kfree(minor_table);
3189
3190 unregister_blkdev(DRBD_MAJOR, "drbd");
3191
3192 printk(KERN_INFO "drbd: module cleanup done.\n");
3193}
3194
3195/**
3196 * drbd_congested() - Callback for pdflush
3197 * @congested_data: User data
3198 * @bdi_bits: Bits pdflush is currently interested in
3199 *
3200 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3201 */
3202static int drbd_congested(void *congested_data, int bdi_bits)
3203{
3204 struct drbd_conf *mdev = congested_data;
3205 struct request_queue *q;
3206 char reason = '-';
3207 int r = 0;
3208
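	/* "reason" is a one-character diagnostic stored in
	 * mdev->congestion_reason below: 'd' = DRBD itself has frozen IO,
	 * 'b' = backing device congested, 'n' = network send path congested,
	 * 'a' = both backing device and network, '-' = not congested. */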
3209 if (!__inc_ap_bio_cond(mdev)) {
3210 /* DRBD has frozen IO */
3211 r = bdi_bits;
3212 reason = 'd';
3213 goto out;
3214 }
3215
3216 if (get_ldev(mdev)) {
3217 q = bdev_get_queue(mdev->ldev->backing_bdev);
3218 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3219 put_ldev(mdev);
3220 if (r)
3221 reason = 'b';
3222 }
3223
3224 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3225 r |= (1 << BDI_async_congested);
3226 reason = reason == 'b' ? 'a' : 'n';
3227 }
3228
3229out:
3230 mdev->congestion_reason = reason;
3231 return r;
3232}
3233
3234struct drbd_conf *drbd_new_device(unsigned int minor)
3235{
3236 struct drbd_conf *mdev;
3237 struct gendisk *disk;
3238 struct request_queue *q;
3239
3240 /* GFP_KERNEL, we are outside of all write-out paths */
3241 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3242 if (!mdev)
3243 return NULL;
3244 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3245 goto out_no_cpumask;
3246
3247 mdev->minor = minor;
3248
3249 drbd_init_set_defaults(mdev);
3250
3251 q = blk_alloc_queue(GFP_KERNEL);
3252 if (!q)
3253 goto out_no_q;
3254 mdev->rq_queue = q;
3255 q->queuedata = mdev;
3256
3257 disk = alloc_disk(1);
3258 if (!disk)
3259 goto out_no_disk;
3260 mdev->vdisk = disk;
3261
3262 set_disk_ro(disk, TRUE);
3263
3264 disk->queue = q;
3265 disk->major = DRBD_MAJOR;
3266 disk->first_minor = minor;
3267 disk->fops = &drbd_ops;
3268 sprintf(disk->disk_name, "drbd%d", minor);
3269 disk->private_data = mdev;
3270
3271 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3272 /* we have no partitions. we contain only ourselves. */
3273 mdev->this_bdev->bd_contains = mdev->this_bdev;
3274
3275 q->backing_dev_info.congested_fn = drbd_congested;
3276 q->backing_dev_info.congested_data = mdev;
3277
3278 blk_queue_make_request(q, drbd_make_request_26);
3279	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
3280	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3281	blk_queue_merge_bvec(q, drbd_merge_bvec);
3282	q->queue_lock = &mdev->req_lock;
3283
3284 mdev->md_io_page = alloc_page(GFP_KERNEL);
3285 if (!mdev->md_io_page)
3286 goto out_no_io_page;
3287
3288 if (drbd_bm_init(mdev))
3289 goto out_no_bitmap;
3290 /* no need to lock access, we are still initializing this minor device. */
3291 if (!tl_init(mdev))
3292 goto out_no_tl;
3293
3294 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3295 if (!mdev->app_reads_hash)
3296 goto out_no_app_reads;
3297
3298 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3299 if (!mdev->current_epoch)
3300 goto out_no_epoch;
3301
3302 INIT_LIST_HEAD(&mdev->current_epoch->list);
3303 mdev->epochs = 1;
3304
3305 return mdev;
3306
3307/* out_whatever_else:
3308 kfree(mdev->current_epoch); */
3309out_no_epoch:
3310 kfree(mdev->app_reads_hash);
3311out_no_app_reads:
3312 tl_cleanup(mdev);
3313out_no_tl:
3314 drbd_bm_cleanup(mdev);
3315out_no_bitmap:
3316 __free_page(mdev->md_io_page);
3317out_no_io_page:
3318 put_disk(disk);
3319out_no_disk:
3320 blk_cleanup_queue(q);
3321out_no_q:
3322 free_cpumask_var(mdev->cpu_mask);
3323out_no_cpumask:
3324 kfree(mdev);
3325 return NULL;
3326}
3327
3328/* counterpart of drbd_new_device.
3329 * last part of drbd_delete_device. */
3330void drbd_free_mdev(struct drbd_conf *mdev)
3331{
3332 kfree(mdev->current_epoch);
3333 kfree(mdev->app_reads_hash);
3334 tl_cleanup(mdev);
3335 if (mdev->bitmap) /* should no longer be there. */
3336 drbd_bm_cleanup(mdev);
3337 __free_page(mdev->md_io_page);
3338 put_disk(mdev->vdisk);
3339 blk_cleanup_queue(mdev->rq_queue);
3340 free_cpumask_var(mdev->cpu_mask);
3341	drbd_free_tl_hash(mdev);
3342	kfree(mdev);
3343}
3344
3345
3346int __init drbd_init(void)
3347{
3348 int err;
3349
3350 if (sizeof(struct p_handshake) != 80) {
3351 printk(KERN_ERR
3352 "drbd: never change the size or layout "
3353 "of the HandShake packet.\n");
3354 return -EINVAL;
3355 }
3356
3357 if (1 > minor_count || minor_count > 255) {
3358 printk(KERN_ERR
3359 "drbd: invalid minor_count (%d)\n", minor_count);
3360#ifdef MODULE
3361 return -EINVAL;
3362#else
3363 minor_count = 8;
3364#endif
3365 }
3366
3367 err = drbd_nl_init();
3368 if (err)
3369 return err;
3370
3371 err = register_blkdev(DRBD_MAJOR, "drbd");
3372 if (err) {
3373 printk(KERN_ERR
3374 "drbd: unable to register block device major %d\n",
3375 DRBD_MAJOR);
3376 return err;
3377 }
3378
3379 register_reboot_notifier(&drbd_notifier);
3380
3381 /*
3382 * allocate all necessary structs
3383 */
3384 err = -ENOMEM;
3385
3386 init_waitqueue_head(&drbd_pp_wait);
3387
3388 drbd_proc = NULL; /* play safe for drbd_cleanup */
3389 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3390 GFP_KERNEL);
3391 if (!minor_table)
3392 goto Enomem;
3393
3394 err = drbd_create_mempools();
3395 if (err)
3396 goto Enomem;
3397
3398	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3399	if (!drbd_proc) {
3400 printk(KERN_ERR "drbd: unable to register proc file\n");
3401 goto Enomem;
3402 }
3403
3404 rwlock_init(&global_state_lock);
3405
3406 printk(KERN_INFO "drbd: initialized. "
3407 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3408 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3409 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3410 printk(KERN_INFO "drbd: registered as block device major %d\n",
3411 DRBD_MAJOR);
3412 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3413
3414 return 0; /* Success! */
3415
3416Enomem:
3417 drbd_cleanup();
3418 if (err == -ENOMEM)
3419 /* currently always the case */
3420 printk(KERN_ERR "drbd: ran out of memory\n");
3421 else
3422 printk(KERN_ERR "drbd: initialization failure\n");
3423 return err;
3424}
3425
3426void drbd_free_bc(struct drbd_backing_dev *ldev)
3427{
3428 if (ldev == NULL)
3429 return;
3430
3431	blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3432	blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3433
3434 kfree(ldev);
3435}
3436
3437void drbd_free_sock(struct drbd_conf *mdev)
3438{
3439 if (mdev->data.socket) {
3440		mutex_lock(&mdev->data.mutex);
3441		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3442		sock_release(mdev->data.socket);
3443		mdev->data.socket = NULL;
3444		mutex_unlock(&mdev->data.mutex);
3445	}
3446	if (mdev->meta.socket) {
3447		mutex_lock(&mdev->meta.mutex);
3448		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3449		sock_release(mdev->meta.socket);
3450		mdev->meta.socket = NULL;
3451		mutex_unlock(&mdev->meta.mutex);
3452	}
3453}
3454
3455
3456void drbd_free_resources(struct drbd_conf *mdev)
3457{
3458 crypto_free_hash(mdev->csums_tfm);
3459 mdev->csums_tfm = NULL;
3460 crypto_free_hash(mdev->verify_tfm);
3461 mdev->verify_tfm = NULL;
3462 crypto_free_hash(mdev->cram_hmac_tfm);
3463 mdev->cram_hmac_tfm = NULL;
3464 crypto_free_hash(mdev->integrity_w_tfm);
3465 mdev->integrity_w_tfm = NULL;
3466 crypto_free_hash(mdev->integrity_r_tfm);
3467 mdev->integrity_r_tfm = NULL;
3468
3469 drbd_free_sock(mdev);
3470
3471 __no_warn(local,
3472 drbd_free_bc(mdev->ldev);
3473 mdev->ldev = NULL;);
3474}
3475
3476/* meta data management */
3477
3478struct meta_data_on_disk {
3479 u64 la_size; /* last agreed size. */
3480 u64 uuid[UI_SIZE]; /* UUIDs. */
3481 u64 device_uuid;
3482 u64 reserved_u64_1;
3483 u32 flags; /* MDF */
3484 u32 magic;
3485 u32 md_size_sect;
3486 u32 al_offset; /* offset to this block */
3487 u32 al_nr_extents; /* important for restoring the AL */
3488 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3489 u32 bm_offset; /* offset to the bitmap, from here */
3490 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3491 u32 reserved_u32[4];
3492
3493} __packed;
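/* Serialization note: drbd_md_sync() below stores every multi-byte field of
 * this structure big-endian (cpu_to_be32/64) into the first 512 bytes of the
 * md_io_page buffer and writes it at md_offset of the meta data device. */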
3494
3495/**
3496 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3497 * @mdev: DRBD device.
3498 */
3499void drbd_md_sync(struct drbd_conf *mdev)
3500{
3501 struct meta_data_on_disk *buffer;
3502 sector_t sector;
3503 int i;
3504
3505	del_timer(&mdev->md_sync_timer);
3506	/* timer may be rearmed by drbd_md_mark_dirty() now. */
3507	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3508		return;
3509
3510 /* We use here D_FAILED and not D_ATTACHING because we try to write
3511 * metadata even if we detach due to a disk failure! */
3512 if (!get_ldev_if_state(mdev, D_FAILED))
3513 return;
3514
3515	mutex_lock(&mdev->md_io_mutex);
3516 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3517 memset(buffer, 0, 512);
3518
3519 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3520 for (i = UI_CURRENT; i < UI_SIZE; i++)
3521 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3522 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3523 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3524
3525 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3526 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3527 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3528 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3529 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3530
3531 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3532
3533 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3534 sector = mdev->ldev->md.md_offset;
3535
3536	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3537		/* this was a try anyways ... */
3538		dev_err(DEV, "meta data update failed!\n");
3539		drbd_chk_io_error(mdev, 1, TRUE);
3540 }
3541
3542 /* Update mdev->ldev->md.la_size_sect,
3543 * since we updated it on metadata. */
3544 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3545
3546 mutex_unlock(&mdev->md_io_mutex);
3547 put_ldev(mdev);
3548}
3549
3550/**
3551 * drbd_md_read() - Reads in the meta data super block
3552 * @mdev: DRBD device.
3553 * @bdev: Device from which the meta data should be read in.
3554 *
3555 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3556 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3557 */
3558int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3559{
3560 struct meta_data_on_disk *buffer;
3561 int i, rv = NO_ERROR;
3562
3563 if (!get_ldev_if_state(mdev, D_ATTACHING))
3564 return ERR_IO_MD_DISK;
3565
3566	mutex_lock(&mdev->md_io_mutex);
3567 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3568
3569 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3570		/* NOTE: can't do normal error processing here as this is
3571 called BEFORE disk is attached */
3572 dev_err(DEV, "Error while reading metadata.\n");
3573 rv = ERR_IO_MD_DISK;
3574 goto err;
3575 }
3576
3577 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3578 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3579 rv = ERR_MD_INVALID;
3580 goto err;
3581 }
3582 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3583 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3584 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3585 rv = ERR_MD_INVALID;
3586 goto err;
3587 }
3588 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3589 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3590 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3591 rv = ERR_MD_INVALID;
3592 goto err;
3593 }
3594 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3595 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3596 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3597 rv = ERR_MD_INVALID;
3598 goto err;
3599 }
3600
3601 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3602 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3603 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3604 rv = ERR_MD_INVALID;
3605 goto err;
3606 }
3607
3608 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3609 for (i = UI_CURRENT; i < UI_SIZE; i++)
3610 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3611 bdev->md.flags = be32_to_cpu(buffer->flags);
3612 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3613 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3614
3615 if (mdev->sync_conf.al_extents < 7)
3616 mdev->sync_conf.al_extents = 127;
3617
3618 err:
3619 mutex_unlock(&mdev->md_io_mutex);
3620 put_ldev(mdev);
3621
3622 return rv;
3623}
3624
3625static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3626{
3627 static char *uuid_str[UI_EXTENDED_SIZE] = {
3628 [UI_CURRENT] = "CURRENT",
3629 [UI_BITMAP] = "BITMAP",
3630 [UI_HISTORY_START] = "HISTORY_START",
3631 [UI_HISTORY_END] = "HISTORY_END",
3632 [UI_SIZE] = "SIZE",
3633 [UI_FLAGS] = "FLAGS",
3634 };
3635
3636 if (index >= UI_EXTENDED_SIZE) {
3637 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3638 return;
3639 }
3640
3641 dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3642 uuid_str[index],
3643 (unsigned long long)mdev->ldev->md.uuid[index]);
3644}
3645
3646
3647/**
3648 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3649 * @mdev: DRBD device.
3650 *
3651 * Call this function if you change anything that should be written to
3652 * the meta-data super block. This function sets MD_DIRTY, and starts a
3653 * timer that ensures that drbd_md_sync() is called within five seconds.
3654 */
3655#ifdef DEBUG
3656void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3657{
3658 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3659 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3660 mdev->last_md_mark_dirty.line = line;
3661 mdev->last_md_mark_dirty.func = func;
3662 }
3663}
3664#else
3665void drbd_md_mark_dirty(struct drbd_conf *mdev)
3666{
3667	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3668		mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3669}
3670#endif
3671
3672static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3673{
3674 int i;
3675
3676	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3677		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3678		debug_drbd_uuid(mdev, i+1);
3679	}
3680}
3681
3682void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3683{
3684 if (idx == UI_CURRENT) {
3685 if (mdev->state.role == R_PRIMARY)
3686 val |= 1;
3687 else
3688 val &= ~((u64)1);
3689
3690 drbd_set_ed_uuid(mdev, val);
3691 }
3692
3693 mdev->ldev->md.uuid[idx] = val;
3694	debug_drbd_uuid(mdev, idx);
3695	drbd_md_mark_dirty(mdev);
3696}
3697
3698
3699void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3700{
3701 if (mdev->ldev->md.uuid[idx]) {
3702 drbd_uuid_move_history(mdev);
3703 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3704		debug_drbd_uuid(mdev, UI_HISTORY_START);
3705	}
3706 _drbd_uuid_set(mdev, idx, val);
3707}
3708
3709/**
3710 * drbd_uuid_new_current() - Creates a new current UUID
3711 * @mdev: DRBD device.
3712 *
3713 * Creates a new current UUID, and rotates the old current UUID into
3714 * the bitmap slot. Causes an incremental resync upon next connect.
3715 */
3716void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3717{
3718 u64 val;
3719
3720 dev_info(DEV, "Creating new current UUID\n");
3721 D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3722 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3723	debug_drbd_uuid(mdev, UI_BITMAP);
3724
3725 get_random_bytes(&val, sizeof(u64));
3726 _drbd_uuid_set(mdev, UI_CURRENT, val);
3727	/* get it to stable storage _now_ */
3728	drbd_md_sync(mdev);
3729}
3730
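/* Illustrative walk-through (not from the source): with current UUID A and an
 * empty bitmap slot, drbd_uuid_new_current() leaves (current = fresh random R,
 * bitmap = A); when the resulting resync completes, a call to
 * drbd_uuid_set_bm(mdev, 0) below is expected to push A into HISTORY_START and
 * clear the bitmap slot again. */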
3731void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3732{
3733 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3734 return;
3735
3736 if (val == 0) {
3737 drbd_uuid_move_history(mdev);
3738 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3739 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3740		debug_drbd_uuid(mdev, UI_HISTORY_START);
3741		debug_drbd_uuid(mdev, UI_BITMAP);
3742	} else {
3743 if (mdev->ldev->md.uuid[UI_BITMAP])
3744			dev_warn(DEV, "bm UUID already set\n");
3745
3746 mdev->ldev->md.uuid[UI_BITMAP] = val;
3747 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3748
3749		debug_drbd_uuid(mdev, UI_BITMAP);
3750	}
3751 drbd_md_mark_dirty(mdev);
3752}
3753
3754/**
3755 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3756 * @mdev: DRBD device.
3757 *
3758 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3759 */
3760int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3761{
3762 int rv = -EIO;
3763
3764 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3765 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3766 drbd_md_sync(mdev);
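		/* MDF_FULL_SYNC is set and flushed to the super block before
		 * the bitmap is touched; should we crash during the bitmap
		 * write below, the flag survives and the full sync can be
		 * redone on the next attach. */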
3767 drbd_bm_set_all(mdev);
3768
3769 rv = drbd_bm_write(mdev);
3770
3771 if (!rv) {
3772 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3773 drbd_md_sync(mdev);
3774 }
3775
3776 put_ldev(mdev);
3777 }
3778
3779 return rv;
3780}
3781
3782/**
3783 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3784 * @mdev: DRBD device.
3785 *
3786 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3787 */
3788int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3789{
3790 int rv = -EIO;
3791
3792	drbd_resume_al(mdev);
3793	if (get_ldev_if_state(mdev, D_ATTACHING)) {
3794 drbd_bm_clear_all(mdev);
3795 rv = drbd_bm_write(mdev);
3796 put_ldev(mdev);
3797 }
3798
3799 return rv;
3800}
3801
3802static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3803{
3804 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3805 int rv;
3806
3807 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3808
3809 drbd_bm_lock(mdev, work->why);
3810 rv = work->io_fn(mdev);
3811 drbd_bm_unlock(mdev);
3812
3813 clear_bit(BITMAP_IO, &mdev->flags);
3814	smp_mb__after_clear_bit();
3815	wake_up(&mdev->misc_wait);
3816
3817 if (work->done)
3818 work->done(mdev, rv);
3819
3820 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3821 work->why = NULL;
3822
3823 return 1;
3824}
3825
3826void drbd_ldev_destroy(struct drbd_conf *mdev)
3827{
3828 lc_destroy(mdev->resync);
3829 mdev->resync = NULL;
3830 lc_destroy(mdev->act_log);
3831 mdev->act_log = NULL;
3832 __no_warn(local,
3833 drbd_free_bc(mdev->ldev);
3834 mdev->ldev = NULL;);
3835
3836 if (mdev->md_io_tmpp) {
3837 __free_page(mdev->md_io_tmpp);
3838 mdev->md_io_tmpp = NULL;
3839 }
3840 clear_bit(GO_DISKLESS, &mdev->flags);
3841}
3842
3843static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3844{
3845	D_ASSERT(mdev->state.disk == D_FAILED);
3846	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3847	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3848	 * the protected members anymore, though, so once put_ldev reaches zero
3849	 * again, it will be safe to free them. */
3850	drbd_force_state(mdev, NS(disk, D_DISKLESS));
3851	return 1;
3852}
3853
3854void drbd_go_diskless(struct drbd_conf *mdev)
3855{
3856 D_ASSERT(mdev->state.disk == D_FAILED);
3857 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3858		drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3859}
3860
3861/**
3862 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3863 * @mdev: DRBD device.
3864 * @io_fn: IO callback to be called when bitmap IO is possible
3865 * @done: callback to be called after the bitmap IO was performed
3866 * @why: Descriptive text of the reason for doing the IO
3867 *
3868 * While IO on the bitmap happens we freeze application IO, thus ensuring
3869 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3870 * called from worker context. It MUST NOT be used while a previous such
3871 * work is still pending!
3872 */
3873void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3874 int (*io_fn)(struct drbd_conf *),
3875 void (*done)(struct drbd_conf *, int),
3876 char *why)
3877{
3878 D_ASSERT(current == mdev->worker.task);
3879
3880 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3881 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3882 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3883 if (mdev->bm_io_work.why)
3884 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3885 why, mdev->bm_io_work.why);
3886
3887 mdev->bm_io_work.io_fn = io_fn;
3888 mdev->bm_io_work.done = done;
3889 mdev->bm_io_work.why = why;
3890
3891	spin_lock_irq(&mdev->req_lock);
3892	set_bit(BITMAP_IO, &mdev->flags);
3893	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3894		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
3895			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3896	}
3897	spin_unlock_irq(&mdev->req_lock);
3898}
3899
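/*
 * Illustrative use only (not a call site in this file): from worker context a
 * full bitmap write could be queued roughly like this, with after_full_write()
 * being a hypothetical completion callback:
 *
 *	static void after_full_write(struct drbd_conf *mdev, int rv)
 *	{
 *		if (rv)
 *			dev_err(DEV, "bitmap write failed\n");
 *	}
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bm_write, after_full_write,
 *			     "example full bitmap write");
 */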
3900/**
3901 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
3902 * @mdev: DRBD device.
3903 * @io_fn: IO callback to be called when bitmap IO is possible
3904 * @why: Descriptive text of the reason for doing the IO
3905 *
3906 * Freezes application IO while the actual IO operation runs. This
3907 * function MAY NOT be called from worker context.
3908 */
3909int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3910{
3911 int rv;
3912
3913 D_ASSERT(current != mdev->worker.task);
3914
3915 drbd_suspend_io(mdev);
3916
3917 drbd_bm_lock(mdev, why);
3918 rv = io_fn(mdev);
3919 drbd_bm_unlock(mdev);
3920
3921 drbd_resume_io(mdev);
3922
3923 return rv;
3924}
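/*
 * Illustrative use only: outside of worker context a caller could run the
 * set-and-write helper above synchronously, e.g.
 *
 *	int rv = drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *				"example set_n_write");
 *
 * application IO stays suspended for the duration of the call.
 */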
3925
3926void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3927{
3928 if ((mdev->ldev->md.flags & flag) != flag) {
3929 drbd_md_mark_dirty(mdev);
3930 mdev->ldev->md.flags |= flag;
3931 }
3932}
3933
3934void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3935{
3936 if ((mdev->ldev->md.flags & flag) != 0) {
3937 drbd_md_mark_dirty(mdev);
3938 mdev->ldev->md.flags &= ~flag;
3939 }
3940}
3941int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3942{
3943 return (bdev->md.flags & flag) != 0;
3944}
3945
3946static void md_sync_timer_fn(unsigned long data)
3947{
3948 struct drbd_conf *mdev = (struct drbd_conf *) data;
3949
3950 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3951}
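/* The timer armed by drbd_md_mark_dirty() ends up here: the work is queued at
 * the front of the worker queue, and w_md_sync() below warns (the super block
 * should normally have been written explicitly long before the timeout) and
 * then performs the overdue drbd_md_sync(). */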
3952
3953static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3954{
3955 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3956#ifdef DEBUG
3957 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3958 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3959#endif
3960	drbd_md_sync(mdev);
3961	return 1;
3962}
3963
3964#ifdef CONFIG_DRBD_FAULT_INJECTION
3965/* Fault insertion support including random number generator shamelessly
3966 * stolen from kernel/rcutorture.c */
3967struct fault_random_state {
3968 unsigned long state;
3969 unsigned long count;
3970};
3971
3972#define FAULT_RANDOM_MULT 39916801 /* prime */
3973#define FAULT_RANDOM_ADD 479001701 /* prime */
3974#define FAULT_RANDOM_REFRESH 10000
3975
3976/*
3977 * Crude but fast random-number generator. Uses a linear congruential
3978 * generator, with occasional help from get_random_bytes().
3979 */
3980static unsigned long
3981_drbd_fault_random(struct fault_random_state *rsp)
3982{
3983 long refresh;
3984
3985	if (!rsp->count--) {
3986		get_random_bytes(&refresh, sizeof(refresh));
3987 rsp->state += refresh;
3988 rsp->count = FAULT_RANDOM_REFRESH;
3989 }
3990 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3991 return swahw32(rsp->state);
3992}
3993
3994static char *
3995_drbd_fault_str(unsigned int type) {
3996 static char *_faults[] = {
3997 [DRBD_FAULT_MD_WR] = "Meta-data write",
3998 [DRBD_FAULT_MD_RD] = "Meta-data read",
3999 [DRBD_FAULT_RS_WR] = "Resync write",
4000 [DRBD_FAULT_RS_RD] = "Resync read",
4001 [DRBD_FAULT_DT_WR] = "Data write",
4002 [DRBD_FAULT_DT_RD] = "Data read",
4003 [DRBD_FAULT_DT_RA] = "Data read ahead",
4004 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4005		[DRBD_FAULT_AL_EE] = "EE allocation",
4006		[DRBD_FAULT_RECEIVE] = "receive data corruption",
4007	};
4008
4009 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4010}
4011
4012unsigned int
4013_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4014{
4015 static struct fault_random_state rrs = {0, 0};
4016
4017 unsigned int ret = (
4018 (fault_devs == 0 ||
4019 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4020 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
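	/* Nonzero when this minor is selected by the fault_devs bitmask
	 * (0 means "all devices") and a pseudo-random roll in 1..100 falls
	 * within the configured fault_rate percentage. */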
4021
4022 if (ret) {
4023 fault_count++;
4024
4025		if (__ratelimit(&drbd_ratelimit_state))
4026			dev_warn(DEV, "***Simulating %s failure\n",
4027 _drbd_fault_str(type));
4028 }
4029
4030 return ret;
4031}
4032#endif
4033
4034const char *drbd_buildtag(void)
4035{
4036 /* DRBD built from external sources has here a reference to the
4037 git hash of the source code. */
4038
4039 static char buildtag[38] = "\0uilt-in";
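	/* buildtag starts with a NUL byte, so it initially reads as an empty
	 * string; if no module srcversion is available, the first byte is
	 * patched to 'b' below, turning the remaining "uilt-in" into
	 * "built-in". */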
4040
4041 if (buildtag[0] == 0) {
4042#ifdef CONFIG_MODULES
4043 if (THIS_MODULE != NULL)
4044 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4045 else
4046#endif
4047 buildtag[0] = 'b';
4048 }
4049
4050 return buildtag;
4051}
4052
4053module_init(drbd_init)
4054module_exit(drbd_cleanup)
4055
4056EXPORT_SYMBOL(drbd_conn_str);
4057EXPORT_SYMBOL(drbd_role_str);
4058EXPORT_SYMBOL(drbd_disk_str);
4059EXPORT_SYMBOL(drbd_set_st_err_str);