/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
66
static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
69int drbd_worker(struct drbd_thread *);
70int drbd_asender(struct drbd_thread *);
71
72int drbd_init(void);
73static int drbd_open(struct block_device *bdev, fmode_t mode);
74static int drbd_release(struct gendisk *gd, fmode_t mode);
75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79static void md_sync_timer_fn(unsigned long data);
80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84 "Lars Ellenberg <lars@linbit.com>");
85MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86MODULE_VERSION(REL_VERSION);
87MODULE_LICENSE("GPL");
88MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
89MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
90
91#include <linux/moduleparam.h>
92/* allow_open_on_secondary */
93MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not as a module),
 * this becomes the boot parameter drbd.minor_count */
96module_param(minor_count, uint, 0444);
97module_param(disable_sendpage, bool, 0644);
98module_param(allow_oos, bool, 0);
99module_param(cn_idx, uint, 0444);
100module_param(proc_details, int, 0644);
101
102#ifdef CONFIG_DRBD_FAULT_INJECTION
103int enable_faults;
104int fault_rate;
105static int fault_count;
106int fault_devs;
107/* bitmap of enabled faults */
108module_param(enable_faults, int, 0664);
109/* fault rate % value - applies to all enabled faults */
110module_param(fault_rate, int, 0664);
111/* count of faults inserted */
112module_param(fault_count, int, 0664);
113/* bitmap of devices to insert faults on */
114module_param(fault_devs, int, 0644);
115#endif
116
/* module parameters, defined here */
unsigned int minor_count = 32;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in /proc/drbd */
123
124/* Module parameter for setting the user mode helper program
125 * to run. Default is /sbin/drbdadm */
126char usermode_helper[80] = "/sbin/drbdadm";
127
128module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
129
130/* in 2.6.x, our device mapping and config info contains our virtual gendisks
131 * as member "struct gendisk *vdisk;"
132 */
133struct drbd_conf **minor_table;
134
135struct kmem_cache *drbd_request_cache;
136struct kmem_cache *drbd_ee_cache; /* epoch entries */
137struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
138struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
139mempool_t *drbd_request_mempool;
140mempool_t *drbd_ee_mempool;
141
/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
	 member of struct page.
 */
148struct page *drbd_pp_pool;
149spinlock_t drbd_pp_lock;
150int drbd_pp_vacant;
151wait_queue_head_t drbd_pp_wait;
152
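/* allow at most 5 rate-limited messages per 5*HZ jiffies window */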
153DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
154
static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};
160
161#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
162
163#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
167int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
168{
169 int io_allowed;
170
171 atomic_inc(&mdev->local_cnt);
172 io_allowed = (mdev->state.disk >= mins);
173 if (!io_allowed) {
174 if (atomic_dec_and_test(&mdev->local_cnt))
175 wake_up(&mdev->misc_wait);
176 }
177 return io_allowed;
178}
179
180#endif
181
/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
 * attached.
 */
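/*
 * Rough sketch (informal, matching the description above):
 *
 *   oldest_tle -> [epoch: reqs r1, r2, ...] -> [epoch: ...] -> ... -> newest_tle
 *
 * _tl_add_barrier() appends a fresh epoch at the newest end; tl_release()
 * retires (or recycles) the oldest epoch once its barrier ack arrives.
 */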
192static int tl_init(struct drbd_conf *mdev)
193{
194 struct drbd_tl_epoch *b;
195
196 /* during device minor initialization, we may well use GFP_KERNEL */
197 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
198 if (!b)
199 return 0;
200 INIT_LIST_HEAD(&b->requests);
201 INIT_LIST_HEAD(&b->w.list);
202 b->next = NULL;
203 b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
206
207 mdev->oldest_tle = b;
208 mdev->newest_tle = b;
209 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
210
211 mdev->tl_hash = NULL;
212 mdev->tl_hash_s = 0;
213
214 return 1;
215}
216
217static void tl_cleanup(struct drbd_conf *mdev)
218{
219 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
220 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
221 kfree(mdev->oldest_tle);
222 mdev->oldest_tle = NULL;
223 kfree(mdev->unused_spare_tle);
224 mdev->unused_spare_tle = NULL;
225 kfree(mdev->tl_hash);
226 mdev->tl_hash = NULL;
227 mdev->tl_hash_s = 0;
228}
229
230/**
231 * _tl_add_barrier() - Adds a barrier to the transfer log
232 * @mdev: DRBD device.
233 * @new: Barrier to be added before the current head of the TL.
234 *
235 * The caller must hold the req_lock.
236 */
237void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
238{
239 struct drbd_tl_epoch *newest_before;
240
241 INIT_LIST_HEAD(&new->requests);
242 INIT_LIST_HEAD(&new->w.list);
243 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
244 new->next = NULL;
	new->n_writes = 0;

247 newest_before = mdev->newest_tle;
248 /* never send a barrier number == 0, because that is special-cased
249 * when using TCQ for our write ordering code */
250 new->br_number = (newest_before->br_number+1) ?: 1;
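	/* (gcc's "a ?: b" means "a ? a : b": if the incremented number wraps
	 *  to 0, hand out 1 instead, so the special value 0 is never used) */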
251 if (mdev->newest_tle != new) {
252 mdev->newest_tle->next = new;
253 mdev->newest_tle = new;
254 }
255}
256
257/**
258 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
259 * @mdev: DRBD device.
260 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
261 * @set_size: Expected number of requests before that barrier.
262 *
263 * In case the passed barrier_nr or set_size does not match the oldest
264 * &struct drbd_tl_epoch objects this function will cause a termination
265 * of the connection.
266 */
267void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
268 unsigned int set_size)
269{
270 struct drbd_tl_epoch *b, *nob; /* next old barrier */
271 struct list_head *le, *tle;
272 struct drbd_request *r;
273
274 spin_lock_irq(&mdev->req_lock);
275
276 b = mdev->oldest_tle;
277
278 /* first some paranoia code */
279 if (b == NULL) {
280 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
281 barrier_nr);
282 goto bail;
283 }
284 if (b->br_number != barrier_nr) {
285 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
286 barrier_nr, b->br_number);
287 goto bail;
288 }
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
293 }
294
295 /* Clean up list of requests processed during current epoch */
296 list_for_each_safe(le, tle, &b->requests) {
297 r = list_entry(le, struct drbd_request, tl_requests);
298 _req_mod(r, barrier_acked);
299 }
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruption of
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violate write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
313 list_del_init(&b->requests);
314
315 nob = b->next;
316 if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
317 _tl_add_barrier(mdev, b);
318 if (nob)
319 mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle already points to b */
322 } else {
323 D_ASSERT(nob != NULL);
324 mdev->oldest_tle = nob;
325 kfree(b);
326 }
327
328 spin_unlock_irq(&mdev->req_lock);
329 dec_ap_pending(mdev);
330
331 return;
332
333bail:
334 spin_unlock_irq(&mdev->req_lock);
335 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
336}
337
/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}
413
/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}
450
void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}
457
458/**
459 * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
460 * @mdev: DRBD device.
461 * @os: old (current) state.
462 * @ns: new (wanted) state.
463 */
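/* (Informally: a change is "cluster wide" when it promotes us to Primary,
 *  starts a sync or online verify, detaches the disk, or disconnects while
 *  connected - i.e. anything the connected peer has to agree to first.) */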
464static int cl_wide_st_chg(struct drbd_conf *mdev,
465 union drbd_state os, union drbd_state ns)
466{
467 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
468 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
469 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
470 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
471 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
472 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
473 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
474}
475
476int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
477 union drbd_state mask, union drbd_state val)
478{
479 unsigned long flags;
480 union drbd_state os, ns;
481 int rv;
482
483 spin_lock_irqsave(&mdev->req_lock, flags);
484 os = mdev->state;
485 ns.i = (os.i & ~mask.i) | val.i;
486 rv = _drbd_set_state(mdev, ns, f, NULL);
487 ns = mdev->state;
488 spin_unlock_irqrestore(&mdev->req_lock, flags);
489
490 return rv;
491}
492
493/**
494 * drbd_force_state() - Impose a change which happens outside our control on our state
495 * @mdev: DRBD device.
496 * @mask: mask of state bits to change.
497 * @val: value of new state bits.
498 */
499void drbd_force_state(struct drbd_conf *mdev,
500 union drbd_state mask, union drbd_state val)
501{
502 drbd_change_state(mdev, CS_HARD, mask, val);
503}
504
505static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
506static int is_valid_state_transition(struct drbd_conf *,
507 union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);
512
513static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
514 union drbd_state mask, union drbd_state val)
515{
516 union drbd_state os, ns;
517 unsigned long flags;
518 int rv;
519
520 if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
521 return SS_CW_SUCCESS;
522
523 if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
524 return SS_CW_FAILED_BY_PEER;
525
526 rv = 0;
527 spin_lock_irqsave(&mdev->req_lock, flags);
528 os = mdev->state;
529 ns.i = (os.i & ~mask.i) | val.i;
530 ns = sanitize_state(mdev, os, ns, NULL);
531
532 if (!cl_wide_st_chg(mdev, os, ns))
533 rv = SS_CW_NO_NEED;
534 if (!rv) {
535 rv = is_valid_state(mdev, ns);
536 if (rv == SS_SUCCESS) {
537 rv = is_valid_state_transition(mdev, ns, os);
538 if (rv == SS_SUCCESS)
539 rv = 0; /* cont waiting, otherwise fail. */
540 }
541 }
542 spin_unlock_irqrestore(&mdev->req_lock, flags);
543
544 return rv;
545}
546
547/**
 * drbd_req_state() - Perform a possibly cluster-wide state change
549 * @mdev: DRBD device.
550 * @mask: mask of state bits to change.
551 * @val: value of new state bits.
552 * @f: flags
553 *
554 * Should not be called directly, use drbd_request_state() or
555 * _drbd_request_state().
556 */
557static int drbd_req_state(struct drbd_conf *mdev,
558 union drbd_state mask, union drbd_state val,
559 enum chg_state_flags f)
560{
561 struct completion done;
562 unsigned long flags;
563 union drbd_state os, ns;
564 int rv;
565
566 init_completion(&done);
567
568 if (f & CS_SERIALIZE)
569 mutex_lock(&mdev->state_mutex);
570
571 spin_lock_irqsave(&mdev->req_lock, flags);
572 os = mdev->state;
573 ns.i = (os.i & ~mask.i) | val.i;
574 ns = sanitize_state(mdev, os, ns, NULL);
575
576 if (cl_wide_st_chg(mdev, os, ns)) {
577 rv = is_valid_state(mdev, ns);
578 if (rv == SS_SUCCESS)
579 rv = is_valid_state_transition(mdev, ns, os);
580 spin_unlock_irqrestore(&mdev->req_lock, flags);
581
582 if (rv < SS_SUCCESS) {
583 if (f & CS_VERBOSE)
584 print_st_err(mdev, os, ns, rv);
585 goto abort;
586 }
587
588 drbd_state_lock(mdev);
589 if (!drbd_send_state_req(mdev, mask, val)) {
590 drbd_state_unlock(mdev);
591 rv = SS_CW_FAILED_BY_PEER;
592 if (f & CS_VERBOSE)
593 print_st_err(mdev, os, ns, rv);
594 goto abort;
595 }
596
597 wait_event(mdev->state_wait,
598 (rv = _req_st_cond(mdev, mask, val)));
599
600 if (rv < SS_SUCCESS) {
601 drbd_state_unlock(mdev);
602 if (f & CS_VERBOSE)
603 print_st_err(mdev, os, ns, rv);
604 goto abort;
605 }
606 spin_lock_irqsave(&mdev->req_lock, flags);
607 os = mdev->state;
608 ns.i = (os.i & ~mask.i) | val.i;
609 rv = _drbd_set_state(mdev, ns, f, &done);
610 drbd_state_unlock(mdev);
611 } else {
612 rv = _drbd_set_state(mdev, ns, f, &done);
613 }
614
615 spin_unlock_irqrestore(&mdev->req_lock, flags);
616
617 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
618 D_ASSERT(current != mdev->worker.task);
619 wait_for_completion(&done);
620 }
621
622abort:
623 if (f & CS_SERIALIZE)
624 mutex_unlock(&mdev->state_mutex);
625
626 return rv;
627}
628
629/**
630 * _drbd_request_state() - Request a state change (with flags)
631 * @mdev: DRBD device.
632 * @mask: mask of state bits to change.
633 * @val: value of new state bits.
634 * @f: flags
635 *
636 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
637 * flag, or when logging of failed state change requests is not desired.
638 */
639int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
640 union drbd_state val, enum chg_state_flags f)
641{
642 int rv;
643
644 wait_event(mdev->state_wait,
645 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
646
647 return rv;
648}
649
650static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
651{
652 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
653 name,
654 drbd_conn_str(ns.conn),
655 drbd_role_str(ns.role),
656 drbd_role_str(ns.peer),
657 drbd_disk_str(ns.disk),
658 drbd_disk_str(ns.pdsk),
Philipp Reisnerfb22c402010-09-08 23:20:21 +0200659 is_susp(ns) ? 's' : 'r',
Philipp Reisnerb411b362009-09-25 16:07:19 -0700660 ns.aftr_isp ? 'a' : '-',
661 ns.peer_isp ? 'p' : '-',
662 ns.user_isp ? 'u' : '-'
663 );
664}
665
666void print_st_err(struct drbd_conf *mdev,
667 union drbd_state os, union drbd_state ns, int err)
668{
669 if (err == SS_IN_TRANSIENT_STATE)
670 return;
671 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
672 print_st(mdev, " state", os);
673 print_st(mdev, "wanted", ns);
674}
675
676
677#define drbd_peer_str drbd_role_str
678#define drbd_pdsk_str drbd_disk_str
679
680#define drbd_susp_str(A) ((A) ? "1" : "0")
681#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
682#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
683#define drbd_user_isp_str(A) ((A) ? "1" : "0")
684
685#define PSC(A) \
686 ({ if (ns.A != os.A) { \
687 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
688 drbd_##A##_str(os.A), \
689 drbd_##A##_str(ns.A)); \
690 } })
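/* e.g. PSC(role) appends "role( Secondary -> Primary ) " to the buffer at pbp
 * whenever ns.role differs from os.role. */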
691
692/**
693 * is_valid_state() - Returns an SS_ error code if ns is not valid
694 * @mdev: DRBD device.
695 * @ns: State to consider.
696 */
697static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
698{
699 /* See drbd_state_sw_errors in drbd_strings.c */
700
701 enum drbd_fencing_p fp;
702 int rv = SS_SUCCESS;
703
704 fp = FP_DONT_CARE;
705 if (get_ldev(mdev)) {
706 fp = mdev->ldev->dc.fencing;
707 put_ldev(mdev);
708 }
709
710 if (get_net_conf(mdev)) {
711 if (!mdev->net_conf->two_primaries &&
712 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
713 rv = SS_TWO_PRIMARIES;
714 put_net_conf(mdev);
715 }
716
717 if (rv <= 0)
718 /* already found a reason to abort */;
719 else if (ns.role == R_SECONDARY && mdev->open_cnt)
720 rv = SS_DEVICE_IN_USE;
721
722 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
723 rv = SS_NO_UP_TO_DATE_DISK;
724
725 else if (fp >= FP_RESOURCE &&
726 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
727 rv = SS_PRIMARY_NOP;
728
729 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
730 rv = SS_NO_UP_TO_DATE_DISK;
731
732 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
733 rv = SS_NO_LOCAL_DISK;
734
735 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
736 rv = SS_NO_REMOTE_DISK;
737
	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
742 ns.conn == C_WF_BITMAP_S ||
743 ns.conn == C_SYNC_SOURCE ||
744 ns.conn == C_PAUSED_SYNC_S) &&
745 ns.disk == D_OUTDATED)
746 rv = SS_CONNECTED_OUTDATES;
747
748 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
749 (mdev->sync_conf.verify_alg[0] == 0))
750 rv = SS_NO_VERIFY_ALG;
751
752 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
753 mdev->agreed_pro_version < 88)
754 rv = SS_NOT_SUPPORTED;
755
756 return rv;
757}
758
759/**
760 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
761 * @mdev: DRBD device.
762 * @ns: new state.
763 * @os: old state.
764 */
765static int is_valid_state_transition(struct drbd_conf *mdev,
766 union drbd_state ns, union drbd_state os)
767{
768 int rv = SS_SUCCESS;
769
770 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
771 os.conn > C_CONNECTED)
772 rv = SS_RESYNC_RUNNING;
773
774 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
775 rv = SS_ALREADY_STANDALONE;
776
777 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
778 rv = SS_IS_DISKLESS;
779
780 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
781 rv = SS_NO_NET_CONFIG;
782
783 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
784 rv = SS_LOWER_THAN_OUTDATED;
785
786 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
787 rv = SS_IN_TRANSIENT_STATE;
788
789 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
790 rv = SS_IN_TRANSIENT_STATE;
791
792 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
793 rv = SS_NEED_CONNECTION;
794
795 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
796 ns.conn != os.conn && os.conn > C_CONNECTED)
797 rv = SS_RESYNC_RUNNING;
798
799 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
800 os.conn < C_CONNECTED)
801 rv = SS_NEED_CONNECTION;
802
803 return rv;
804}
805
/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort:
 *
 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort)
{
819 enum drbd_fencing_p fp;
820
821 fp = FP_DONT_CARE;
822 if (get_ldev(mdev)) {
823 fp = mdev->ldev->dc.fencing;
824 put_ldev(mdev);
825 }
826
827 /* Disallow Network errors to configure a device's network part */
828 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
829 os.conn <= C_DISCONNECTING)
830 ns.conn = os.conn;
831
	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;
837
	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;
841
842 /* if we are only D_ATTACHING yet,
843 * we can (and should) go directly to D_DISKLESS. */
844 if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
845 ns.disk = D_DISKLESS;
846
	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;
850
851 if (ns.conn < C_CONNECTED) {
852 ns.peer_isp = 0;
853 ns.peer = R_UNKNOWN;
854 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
855 ns.pdsk = D_UNKNOWN;
856 }
857
858 /* Clear the aftr_isp when becoming unconfigured */
859 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
860 ns.aftr_isp = 0;
861
	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn_sync_abort)
			*warn_sync_abort =
				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				"Online-verify" : "Resync";
		ns.conn = C_CONNECTED;
	}
871
872 if (ns.conn >= C_CONNECTED &&
873 ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
874 (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
875 switch (ns.conn) {
876 case C_WF_BITMAP_T:
877 case C_PAUSED_SYNC_T:
878 ns.disk = D_OUTDATED;
879 break;
880 case C_CONNECTED:
881 case C_WF_BITMAP_S:
882 case C_SYNC_SOURCE:
883 case C_PAUSED_SYNC_S:
884 ns.disk = D_UP_TO_DATE;
885 break;
886 case C_SYNC_TARGET:
887 ns.disk = D_INCONSISTENT;
888 dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
889 break;
890 }
891 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
892 dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
893 }
894
895 if (ns.conn >= C_CONNECTED &&
896 (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
897 switch (ns.conn) {
898 case C_CONNECTED:
899 case C_WF_BITMAP_T:
900 case C_PAUSED_SYNC_T:
901 case C_SYNC_TARGET:
902 ns.pdsk = D_UP_TO_DATE;
903 break;
904 case C_WF_BITMAP_S:
905 case C_PAUSED_SYNC_S:
			/* remap any consistent state to D_OUTDATED,
			 * but disallow "upgrade" of not even consistent states.
			 */
			ns.pdsk =
				(D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
				? os.pdsk : D_OUTDATED;
			break;
913 case C_SYNC_SOURCE:
914 ns.pdsk = D_INCONSISTENT;
915 dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
916 break;
917 }
918 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
919 dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
920 }
921
922 /* Connection breaks down before we finished "Negotiating" */
923 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
924 get_ldev_if_state(mdev, D_NEGOTIATING)) {
925 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
926 ns.disk = mdev->new_state_tmp.disk;
927 ns.pdsk = mdev->new_state_tmp.pdsk;
928 } else {
929 dev_alert(DEV, "Connection lost while negotiating, no data!\n");
930 ns.disk = D_DISKLESS;
931 ns.pdsk = D_UNKNOWN;
932 }
933 put_ldev(mdev);
934 }
935
	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no accessible data is available */
Philipp Reisnerb411b362009-09-25 16:07:19 -0700945
946 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
947 if (ns.conn == C_SYNC_SOURCE)
948 ns.conn = C_PAUSED_SYNC_S;
949 if (ns.conn == C_SYNC_TARGET)
950 ns.conn = C_PAUSED_SYNC_T;
951 } else {
952 if (ns.conn == C_PAUSED_SYNC_S)
953 ns.conn = C_SYNC_SOURCE;
954 if (ns.conn == C_PAUSED_SYNC_T)
955 ns.conn = C_SYNC_TARGET;
956 }
957
958 return ns;
959}
960
/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}
987
static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}
993
/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
int __drbd_set_state(struct drbd_conf *mdev,
		     union drbd_state ns, enum chg_state_flags flags,
		     struct completion *done)
{
	union drbd_state os;
	int rv = SS_SUCCESS;
	const char *warn_sync_abort = NULL;
	struct after_state_chg_work *ascw;
1011
1012 os = mdev->state;
1013
1014 ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1015
1016 if (ns.i == os.i)
1017 return SS_NOTHING_TO_DO;
1018
1019 if (!(flags & CS_HARD)) {
1020 /* pre-state-change checks ; only look at ns */
1021 /* See drbd_state_sw_errors in drbd_strings.c */
1022
1023 rv = is_valid_state(mdev, ns);
1024 if (rv < SS_SUCCESS) {
1025 /* If the old state was illegal as well, then let
1026 this happen...*/
1027
			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
1031 rv = is_valid_state_transition(mdev, ns, os);
1032 }
1033
1034 if (rv < SS_SUCCESS) {
1035 if (flags & CS_VERBOSE)
1036 print_st_err(mdev, os, ns, rv);
1037 return rv;
1038 }
1039
1040 if (warn_sync_abort)
		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);

1043 {
1044 char *pbp, pb[300];
1045 pbp = pb;
1046 *pbp = 0;
1047 PSC(role);
1048 PSC(peer);
1049 PSC(conn);
1050 PSC(disk);
1051 PSC(pdsk);
		if (is_susp(ns) != is_susp(os))
			pbp += sprintf(pbp, "susp( %s -> %s ) ",
				       drbd_susp_str(is_susp(os)),
				       drbd_susp_str(is_susp(ns)));
		PSC(aftr_isp);
1057 PSC(peer_isp);
1058 PSC(user_isp);
1059 dev_info(DEV, "%s\n", pb);
1060 }
1061
1062 /* solve the race between becoming unconfigured,
1063 * worker doing the cleanup, and
1064 * admin reconfiguring us:
1065 * on (re)configure, first set CONFIG_PENDING,
1066 * then wait for a potentially exiting worker,
1067 * start the worker, and schedule one no_op.
1068 * then proceed with configuration.
1069 */
1070 if (ns.disk == D_DISKLESS &&
1071 ns.conn == C_STANDALONE &&
1072 ns.role == R_SECONDARY &&
1073 !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1074 set_bit(DEVICE_DYING, &mdev->flags);
1075
	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;
	wake_up(&mdev->misc_wait);
1086 wake_up(&mdev->state_wait);
1087
	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}
1096
	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}
1135
1136 if (get_ldev(mdev)) {
1137 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1138 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1139 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1140
1141 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1142 mdf |= MDF_CRASHED_PRIMARY;
1143 if (mdev->state.role == R_PRIMARY ||
1144 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1145 mdf |= MDF_PRIMARY_IND;
1146 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1147 mdf |= MDF_CONNECTED_IND;
1148 if (mdev->state.disk > D_INCONSISTENT)
1149 mdf |= MDF_CONSISTENT;
1150 if (mdev->state.disk > D_OUTDATED)
1151 mdf |= MDF_WAS_UP_TO_DATE;
1152 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1153 mdf |= MDF_PEER_OUT_DATED;
1154 if (mdf != mdev->ldev->md.flags) {
1155 mdev->ldev->md.flags = mdf;
1156 drbd_md_mark_dirty(mdev);
1157 }
1158 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1159 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1160 put_ldev(mdev);
1161 }
1162
1163 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1164 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1165 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1166 set_bit(CONSIDER_RESYNC, &mdev->flags);
1167
1168 /* Receiver should clean up itself */
1169 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1170 drbd_thread_stop_nowait(&mdev->receiver);
1171
1172 /* Now the receiver finished cleaning up itself, it should die */
1173 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1174 drbd_thread_stop_nowait(&mdev->receiver);
1175
1176 /* Upon network failure, we need to restart the receiver. */
1177 if (os.conn > C_TEAR_DOWN &&
1178 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1179 drbd_thread_restart_nowait(&mdev->receiver);
1180
	/* Resume AL writing if we get a connection */
1182 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1183 drbd_resume_al(mdev);
1184
	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1186 if (ascw) {
1187 ascw->os = os;
1188 ascw->ns = ns;
1189 ascw->flags = flags;
1190 ascw->w.cb = w_after_state_ch;
1191 ascw->done = done;
1192 drbd_queue_work(&mdev->data.work, &ascw->w);
1193 } else {
1194 dev_warn(DEV, "Could not kmalloc an ascw\n");
1195 }
1196
1197 return rv;
1198}
1199
1200static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1201{
1202 struct after_state_chg_work *ascw =
1203 container_of(w, struct after_state_chg_work, w);
1204 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1205 if (ascw->flags & CS_WAIT_COMPLETE) {
1206 D_ASSERT(ascw->done != NULL);
1207 complete(ascw->done);
1208 }
1209 kfree(ascw);
1210
1211 return 1;
1212}
1213
1214static void abw_start_sync(struct drbd_conf *mdev, int rv)
1215{
1216 if (rv) {
1217 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1218 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1219 return;
1220 }
1221
1222 switch (mdev->state.conn) {
1223 case C_STARTING_SYNC_T:
1224 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1225 break;
1226 case C_STARTING_SYNC_S:
1227 drbd_start_resync(mdev, C_SYNC_SOURCE);
1228 break;
1229 }
1230}
1231
1232/**
1233 * after_state_ch() - Perform after state change actions that may sleep
1234 * @mdev: DRBD device.
1235 * @os: old state.
1236 * @ns: new state.
1237 * @flags: Flags
1238 */
1239static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1240 union drbd_state ns, enum chg_state_flags flags)
1241{
1242 enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

1246 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1247 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1248 if (mdev->p_uuid)
1249 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1250 }
1251
1252 fp = FP_DONT_CARE;
1253 if (get_ldev(mdev)) {
1254 fp = mdev->ldev->dc.fencing;
1255 put_ldev(mdev);
1256 }
1257
1258 /* Inform userspace about the change... */
1259 drbd_bcast_state(mdev, ns);
1260
1261 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1262 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1263 drbd_khelper(mdev, "pri-on-incon-degr");
1264
	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			if (ns.conn == C_CONNECTED)
				what = resend, nsm.susp_nod = 0;
			else /* ns.conn > C_CONNECTED */
				dev_err(DEV, "Unexpected Resync going on!\n");
		}

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = restart_frozen_disk_io, nsm.susp_nod = 0;

	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}
1309
Philipp Reisnerb411b362009-09-25 16:07:19 -07001310 /* Do not change the order of the if above and the two below... */
1311 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1312 drbd_send_uuids(mdev);
1313 drbd_send_state(mdev);
1314 }
1315 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1316 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1317
1318 /* Lost contact to peer's copy of the data */
1319 if ((os.pdsk >= D_INCONSISTENT &&
1320 os.pdsk != D_UNKNOWN &&
1321 os.pdsk != D_OUTDATED)
1322 && (ns.pdsk < D_INCONSISTENT ||
1323 ns.pdsk == D_UNKNOWN ||
1324 ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					set_bit(NEW_CUR_UUID, &mdev->flags);
				} else {
					drbd_uuid_new_current(mdev);
					drbd_send_uuids(mdev);
				}
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
			drbd_uuid_new_current(mdev);
			drbd_send_uuids(mdev);
		}

1345 /* D_DISKLESS Peer becomes secondary */
1346 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1347 drbd_al_to_on_disk_bm(mdev);
1348 put_ldev(mdev);
1349 }
1350
1351 /* Last part of the attaching process ... */
1352 if (ns.conn >= C_CONNECTED &&
1353 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
		drbd_send_uuids(mdev);
1356 drbd_send_state(mdev);
1357 }
1358
1359 /* We want to pause/continue resync, tell peer. */
1360 if (ns.conn >= C_CONNECTED &&
1361 ((os.aftr_isp != ns.aftr_isp) ||
1362 (os.user_isp != ns.user_isp)))
1363 drbd_send_state(mdev);
1364
1365 /* In case one of the isp bits got set, suspend other devices. */
1366 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1367 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1368 suspend_other_sg(mdev);
1369
1370 /* Make sure the peer gets informed about eventual state
1371 changes (ISP bits) while we were in WFReportParams. */
1372 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1373 drbd_send_state(mdev);
1374
1375 /* We are in the progress to start a full sync... */
1376 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1377 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1378 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1379
1380 /* We are invalidating our self... */
1381 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1382 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1383 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1384
	/* first half of local IO error, failure to attach,
	 * or administrative detach */
	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
		enum drbd_io_error_p eh;
		int was_io_error;
		/* corresponding get_ldev was in __drbd_set_state, to serialize
		 * our cleanup here with the transition to D_DISKLESS,
		 * so it is safe to dereference ldev here. */
		eh = mdev->ldev->dc.on_io_error;
		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);

		/* current state still has to be D_FAILED,
		 * there is only one way out: to D_DISKLESS,
		 * and that may only happen after our put_ldev below. */
		if (mdev->state.disk != D_FAILED)
			dev_err(DEV,
				"ASSERT FAILED: disk is %s during detach\n",
				drbd_disk_str(mdev->state.disk));

		if (drbd_send_state(mdev))
			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
		else
			dev_err(DEV, "Sending state for detaching disk failed\n");

		drbd_rs_cancel_all(mdev);

		/* In case we want to get something to stable storage still,
		 * this may be the last chance.
		 * Following put_ldev may transition to D_DISKLESS. */
		drbd_md_sync(mdev);
		put_ldev(mdev);

		if (was_io_error && eh == EP_CALL_HELPER)
			drbd_khelper(mdev, "local-io-error");
	}
1420
	/* second half of local IO error, failure to attach,
	 * or administrative detach,
	 * after local_cnt references have reached zero again */
	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
		/* We must still be diskless,
		 * re-attach has to be serialized with this! */
		if (mdev->state.disk != D_DISKLESS)
			dev_err(DEV,
				"ASSERT FAILED: disk is %s while going diskless\n",
				drbd_disk_str(mdev->state.disk));

		mdev->rs_total = 0;
		mdev->rs_failed = 0;
		atomic_set(&mdev->rs_pending_cnt, 0);

		if (drbd_send_state(mdev))
			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
		else
			dev_err(DEV, "Sending state for being diskless failed\n");
		/* corresponding get_ldev in __drbd_set_state
		 * this may finally trigger drbd_ldev_destroy. */
		put_ldev(mdev);
	}
1444
1445 /* Disks got bigger while they were detached */
1446 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1447 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1448 if (ns.conn == C_CONNECTED)
1449 resync_after_online_grow(mdev);
1450 }
1451
1452 /* A resync finished or aborted, wake paused devices... */
1453 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1454 (os.peer_isp && !ns.peer_isp) ||
1455 (os.user_isp && !ns.user_isp))
1456 resume_next_sg(mdev);
1457
	/* sync target done with resync. Explicitly notify peer, even though
	 * it should (at least for non-empty resyncs) already know itself. */
	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
		drbd_send_state(mdev);

	/* free tl_hash if we got thawed and are C_STANDALONE */
	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
		drbd_free_tl_hash(mdev);
1466
	/* Upon network connection, we need to start the receiver */
1468 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1469 drbd_thread_start(&mdev->receiver);
1470
1471 /* Terminate worker thread if we are unconfigured - it will be
1472 restarted as needed... */
1473 if (ns.disk == D_DISKLESS &&
1474 ns.conn == C_STANDALONE &&
1475 ns.role == R_SECONDARY) {
1476 if (os.aftr_isp != ns.aftr_isp)
1477 resume_next_sg(mdev);
1478 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1479 if (test_bit(DEVICE_DYING, &mdev->flags))
1480 drbd_thread_stop_nowait(&mdev->worker);
1481 }
1482
1483 drbd_md_sync(mdev);
1484}
1485
1486
1487static int drbd_thread_setup(void *arg)
1488{
1489 struct drbd_thread *thi = (struct drbd_thread *) arg;
1490 struct drbd_conf *mdev = thi->mdev;
1491 unsigned long flags;
1492 int retval;
1493
1494restart:
1495 retval = thi->function(thi);
1496
1497 spin_lock_irqsave(&thi->t_lock, flags);
1498
	/* if the receiver has been "Exiting", the last thing it did
	 * was set the conn state to "StandAlone",
	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
	 * and receiver thread will be "started".
	 * drbd_thread_start needs to set "Restarting" in that case.
	 * t_state check and assignment needs to be within the same spinlock,
	 * so either thread_start sees Exiting, and can remap to Restarting,
	 * or thread_start sees None, and can proceed as normal.
	 */
1508
1509 if (thi->t_state == Restarting) {
1510 dev_info(DEV, "Restarting %s\n", current->comm);
1511 thi->t_state = Running;
1512 spin_unlock_irqrestore(&thi->t_lock, flags);
1513 goto restart;
1514 }
1515
1516 thi->task = NULL;
1517 thi->t_state = None;
1518 smp_mb();
1519 complete(&thi->stop);
1520 spin_unlock_irqrestore(&thi->t_lock, flags);
1521
1522 dev_info(DEV, "Terminating %s\n", current->comm);
1523
1524 /* Release mod reference taken when thread was started */
1525 module_put(THIS_MODULE);
1526 return retval;
1527}
1528
1529static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1530 int (*func) (struct drbd_thread *))
1531{
1532 spin_lock_init(&thi->t_lock);
1533 thi->task = NULL;
1534 thi->t_state = None;
1535 thi->function = func;
1536 thi->mdev = mdev;
1537}
1538
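/**
 * drbd_thread_start() - start (or restart) a receiver, worker or asender thread
 *
 * For a thread in state None this takes a module reference and creates the
 * kthread; for a thread that is currently Exiting it only marks it
 * Restarting, so that drbd_thread_setup() loops instead of terminating.
 * Returns FALSE only if the kthread could not be created.
 */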
1539int drbd_thread_start(struct drbd_thread *thi)
1540{
1541 struct drbd_conf *mdev = thi->mdev;
1542 struct task_struct *nt;
1543 unsigned long flags;
1544
1545 const char *me =
1546 thi == &mdev->receiver ? "receiver" :
1547 thi == &mdev->asender ? "asender" :
1548 thi == &mdev->worker ? "worker" : "NONSENSE";
1549
1550 /* is used from state engine doing drbd_thread_stop_nowait,
1551 * while holding the req lock irqsave */
1552 spin_lock_irqsave(&thi->t_lock, flags);
1553
1554 switch (thi->t_state) {
1555 case None:
1556 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1557 me, current->comm, current->pid);
1558
1559 /* Get ref on module for thread - this is released when thread exits */
1560 if (!try_module_get(THIS_MODULE)) {
1561 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1562 spin_unlock_irqrestore(&thi->t_lock, flags);
1563 return FALSE;
1564 }
1565
1566 init_completion(&thi->stop);
1567 D_ASSERT(thi->task == NULL);
1568 thi->reset_cpu_mask = 1;
1569 thi->t_state = Running;
1570 spin_unlock_irqrestore(&thi->t_lock, flags);
1571		flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1572
1573 nt = kthread_create(drbd_thread_setup, (void *) thi,
1574 "drbd%d_%s", mdev_to_minor(mdev), me);
1575
1576 if (IS_ERR(nt)) {
1577 dev_err(DEV, "Couldn't start thread\n");
1578
1579 module_put(THIS_MODULE);
1580 return FALSE;
1581 }
1582 spin_lock_irqsave(&thi->t_lock, flags);
1583 thi->task = nt;
1584 thi->t_state = Running;
1585 spin_unlock_irqrestore(&thi->t_lock, flags);
1586 wake_up_process(nt);
1587 break;
1588 case Exiting:
1589 thi->t_state = Restarting;
1590 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1591 me, current->comm, current->pid);
1592 /* fall through */
1593 case Running:
1594 case Restarting:
1595 default:
1596 spin_unlock_irqrestore(&thi->t_lock, flags);
1597 break;
1598 }
1599
1600 return TRUE;
1601}
1602
1603
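/**
 * _drbd_thread_stop() - ask a drbd thread to exit or to restart
 * @thi:	the thread
 * @restart:	request Restarting instead of Exiting
 * @wait:	block until the thread has signaled completion
 *
 * Sends DRBD_SIGKILL to the task unless called from that thread itself.
 */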
1604void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1605{
1606 unsigned long flags;
1607
1608 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1609
1610 /* may be called from state engine, holding the req lock irqsave */
1611 spin_lock_irqsave(&thi->t_lock, flags);
1612
1613 if (thi->t_state == None) {
1614 spin_unlock_irqrestore(&thi->t_lock, flags);
1615 if (restart)
1616 drbd_thread_start(thi);
1617 return;
1618 }
1619
1620 if (thi->t_state != ns) {
1621 if (thi->task == NULL) {
1622 spin_unlock_irqrestore(&thi->t_lock, flags);
1623 return;
1624 }
1625
1626 thi->t_state = ns;
1627 smp_mb();
1628 init_completion(&thi->stop);
1629 if (thi->task != current)
1630 force_sig(DRBD_SIGKILL, thi->task);
1631
1632 }
1633
1634 spin_unlock_irqrestore(&thi->t_lock, flags);
1635
1636 if (wait)
1637 wait_for_completion(&thi->stop);
1638}
1639
1640#ifdef CONFIG_SMP
1641/**
1642 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1643 * @mdev: DRBD device.
1644 *
1645 * Forces all threads of a device onto the same CPU. This is beneficial for
1646 * DRBD's performance. May be overridden by the user's configuration.
1647 */
1648void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1649{
1650 int ord, cpu;
1651
1652 /* user override. */
1653 if (cpumask_weight(mdev->cpu_mask))
1654 return;
1655
1656 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1657 for_each_online_cpu(cpu) {
1658 if (ord-- == 0) {
1659 cpumask_set_cpu(cpu, mdev->cpu_mask);
1660 return;
1661 }
1662 }
1663 /* should not be reached */
1664 cpumask_setall(mdev->cpu_mask);
1665}
1666
1667/**
1668 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1669 * @mdev: DRBD device.
1670 *
1671 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1672 * prematurely.
1673 */
1674void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1675{
1676 struct task_struct *p = current;
1677 struct drbd_thread *thi =
1678 p == mdev->asender.task ? &mdev->asender :
1679 p == mdev->receiver.task ? &mdev->receiver :
1680 p == mdev->worker.task ? &mdev->worker :
1681 NULL;
1682 ERR_IF(thi == NULL)
1683 return;
1684 if (!thi->reset_cpu_mask)
1685 return;
1686 thi->reset_cpu_mask = 0;
1687 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1688}
1689#endif
1690
1691/* the appropriate socket mutex must be held already */
1692int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001693 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001694 size_t size, unsigned msg_flags)
1695{
1696 int sent, ok;
1697
1698 ERR_IF(!h) return FALSE;
1699 ERR_IF(!size) return FALSE;
1700
1701 h->magic = BE_DRBD_MAGIC;
1702 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001703 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001704
Philipp Reisnerb411b362009-09-25 16:07:19 -07001705 sent = drbd_send(mdev, sock, h, size, msg_flags);
1706
1707 ok = (sent == size);
1708 if (!ok)
1709 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1710 cmdname(cmd), (int)size, sent);
1711 return ok;
1712}
1713
1714/* don't pass the socket. we may only look at it
1715 * when we hold the appropriate socket mutex.
1716 */
1717int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001718 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001719{
1720 int ok = 0;
1721 struct socket *sock;
1722
1723 if (use_data_socket) {
1724 mutex_lock(&mdev->data.mutex);
1725 sock = mdev->data.socket;
1726 } else {
1727 mutex_lock(&mdev->meta.mutex);
1728 sock = mdev->meta.socket;
1729 }
1730
1731 /* drbd_disconnect() could have called drbd_free_sock()
1732	 * while we were waiting for the mutex... */
1733 if (likely(sock != NULL))
1734 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1735
1736 if (use_data_socket)
1737 mutex_unlock(&mdev->data.mutex);
1738 else
1739 mutex_unlock(&mdev->meta.mutex);
1740 return ok;
1741}
1742
1743int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1744 size_t size)
1745{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001746 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001747 int ok;
1748
1749 h.magic = BE_DRBD_MAGIC;
1750 h.command = cpu_to_be16(cmd);
1751 h.length = cpu_to_be16(size);
1752
1753 if (!drbd_get_data_sock(mdev))
1754 return 0;
1755
Philipp Reisnerb411b362009-09-25 16:07:19 -07001756 ok = (sizeof(h) ==
1757 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1758 ok = ok && (size ==
1759 drbd_send(mdev, mdev->data.socket, data, size, 0));
1760
1761 drbd_put_data_sock(mdev);
1762
1763 return ok;
1764}
1765
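/**
 * drbd_send_sync_param() - send the syncer configuration to the peer
 *
 * Packet layout depends on the agreed protocol version: up to apv 87 a
 * plain p_rs_param, apv 88 appends the verify algorithm name, apv 89-94
 * use p_rs_param_89 (verify and csums algorithms), and apv 95+ use
 * p_rs_param_95, which also carries the resync controller settings.
 */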
1766int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1767{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001768 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001769 struct socket *sock;
1770 int size, rv;
1771 const int apv = mdev->agreed_pro_version;
1772
1773 size = apv <= 87 ? sizeof(struct p_rs_param)
1774 : apv == 88 ? sizeof(struct p_rs_param)
1775 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001776 : apv <= 94 ? sizeof(struct p_rs_param_89)
1777 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001778
1779 /* used from admin command context and receiver/worker context.
1780 * to avoid kmalloc, grab the socket right here,
1781 * then use the pre-allocated sbuf there */
1782 mutex_lock(&mdev->data.mutex);
1783 sock = mdev->data.socket;
1784
1785 if (likely(sock != NULL)) {
1786 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1787
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001788 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001789
1790 /* initialize verify_alg and csums_alg */
1791 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1792
1793 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001794 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1795 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1796 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1797 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001798
1799 if (apv >= 88)
1800 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1801 if (apv >= 89)
1802 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1803
1804 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1805 } else
1806 rv = 0; /* not ok */
1807
1808 mutex_unlock(&mdev->data.mutex);
1809
1810 return rv;
1811}
1812
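/**
 * drbd_send_protocol() - send our net_conf settings (P_PROTOCOL) to the peer
 *
 * Carries the wire protocol, the after-split-brain policies, the
 * two-primaries setting and, for apv >= 87, the data integrity algorithm
 * name. Returns 0 if the packet cannot be allocated or if --dry-run is
 * requested but not supported by the peer.
 */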
1813int drbd_send_protocol(struct drbd_conf *mdev)
1814{
1815 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001816 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001817
1818 size = sizeof(struct p_protocol);
1819
1820 if (mdev->agreed_pro_version >= 87)
1821 size += strlen(mdev->net_conf->integrity_alg) + 1;
1822
1823 /* we must not recurse into our own queue,
1824 * as that is blocked during handshake */
1825 p = kmalloc(size, GFP_NOIO);
1826 if (p == NULL)
1827 return 0;
1828
1829 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1830 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1831 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1832 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001833 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1834
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001835 cf = 0;
1836 if (mdev->net_conf->want_lose)
1837 cf |= CF_WANT_LOSE;
1838 if (mdev->net_conf->dry_run) {
1839 if (mdev->agreed_pro_version >= 92)
1840 cf |= CF_DRY_RUN;
1841 else {
1842			dev_err(DEV, "--dry-run is not supported by peer\n");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001843 kfree(p);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001844 return 0;
1845 }
1846 }
1847 p->conn_flags = cpu_to_be32(cf);
1848
Philipp Reisnerb411b362009-09-25 16:07:19 -07001849 if (mdev->agreed_pro_version >= 87)
1850 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1851
1852 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001853 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001854 kfree(p);
1855 return rv;
1856}
1857
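/**
 * _drbd_send_uuids() - send our UUID set (P_UUIDS) to the peer
 * @uuid_flags:	extra flags to announce; combined with the want_lose,
 *		CRASHED_PRIMARY and "disk inconsistent" bits below
 *
 * Also transmits the current number of bits set in the bitmap.
 */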
1858int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1859{
1860 struct p_uuids p;
1861 int i;
1862
1863 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1864 return 1;
1865
1866 for (i = UI_CURRENT; i < UI_SIZE; i++)
1867 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1868
1869 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1870 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1871 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1872 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1873 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1874 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1875
1876 put_ldev(mdev);
1877
1878 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001879 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001880}
1881
1882int drbd_send_uuids(struct drbd_conf *mdev)
1883{
1884 return _drbd_send_uuids(mdev, 0);
1885}
1886
1887int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1888{
1889 return _drbd_send_uuids(mdev, 8);
1890}
1891
1892
1893int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1894{
1895 struct p_rs_uuid p;
1896
1897 p.uuid = cpu_to_be64(val);
1898
1899 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001900 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001901}
1902
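/**
 * drbd_send_sizes() - send our disk sizes and queue limits (P_SIZES)
 * @trigger_reply:	if set, a current capacity of 0 is announced instead
 *			of the real device size
 * @flags:		dds_flags, passed through on the wire
 */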
Philipp Reisnere89b5912010-03-24 17:11:33 +01001903int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001904{
1905 struct p_sizes p;
1906 sector_t d_size, u_size;
1907 int q_order_type;
1908 int ok;
1909
1910 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1911 D_ASSERT(mdev->ldev->backing_bdev);
1912 d_size = drbd_get_max_capacity(mdev->ldev);
1913 u_size = mdev->ldev->dc.disk_size;
1914 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001915 put_ldev(mdev);
1916 } else {
1917 d_size = 0;
1918 u_size = 0;
1919 q_order_type = QUEUE_ORDERED_NONE;
1920 }
1921
1922 p.d_size = cpu_to_be64(d_size);
1923 p.u_size = cpu_to_be64(u_size);
1924 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1925 p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
Philipp Reisnere89b5912010-03-24 17:11:33 +01001926 p.queue_order_type = cpu_to_be16(q_order_type);
1927 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001928
1929 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001930 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001931 return ok;
1932}
1933
1934/**
1935 * drbd_send_state() - Sends the drbd state to the peer
1936 * @mdev: DRBD device.
1937 */
1938int drbd_send_state(struct drbd_conf *mdev)
1939{
1940 struct socket *sock;
1941 struct p_state p;
1942 int ok = 0;
1943
1944	/* Grab state lock so we won't send state if we're in the middle
1945 * of a cluster wide state change on another thread */
1946 drbd_state_lock(mdev);
1947
1948 mutex_lock(&mdev->data.mutex);
1949
1950 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1951 sock = mdev->data.socket;
1952
1953 if (likely(sock != NULL)) {
1954 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001955 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001956 }
1957
1958 mutex_unlock(&mdev->data.mutex);
1959
1960 drbd_state_unlock(mdev);
1961 return ok;
1962}
1963
1964int drbd_send_state_req(struct drbd_conf *mdev,
1965 union drbd_state mask, union drbd_state val)
1966{
1967 struct p_req_state p;
1968
1969 p.mask = cpu_to_be32(mask.i);
1970 p.val = cpu_to_be32(val.i);
1971
1972 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001973 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001974}
1975
1976int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1977{
1978 struct p_req_state_reply p;
1979
1980 p.retcode = cpu_to_be32(retcode);
1981
1982 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001983 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001984}
1985
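/**
 * fill_bitmap_rle_bits() - RLE+VLI compress a chunk of the bitmap into @p
 *
 * Encodes the lengths of alternating runs of clear and set bits as
 * variable length integers. Returns the number of code bytes on success,
 * 0 if compression is not allowed or does not pay off for this chunk,
 * and -1 if an unexpected zero run length is encountered.
 */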
1986int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1987 struct p_compressed_bm *p,
1988 struct bm_xfer_ctx *c)
1989{
1990 struct bitstream bs;
1991 unsigned long plain_bits;
1992 unsigned long tmp;
1993 unsigned long rl;
1994 unsigned len;
1995 unsigned toggle;
1996 int bits;
1997
1998 /* may we use this feature? */
1999 if ((mdev->sync_conf.use_rle == 0) ||
2000 (mdev->agreed_pro_version < 90))
2001 return 0;
2002
2003 if (c->bit_offset >= c->bm_bits)
2004 return 0; /* nothing to do. */
2005
2006	/* use at most this many bytes */
2007 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2008 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2009 /* plain bits covered in this code string */
2010 plain_bits = 0;
2011
2012 /* p->encoding & 0x80 stores whether the first run length is set.
2013 * bit offset is implicit.
2014 * start with toggle == 2 to be able to tell the first iteration */
2015 toggle = 2;
2016
2017	/* see how many plain bits we can stuff into one packet
2018 * using RLE and VLI. */
2019 do {
2020 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2021 : _drbd_bm_find_next(mdev, c->bit_offset);
2022 if (tmp == -1UL)
2023 tmp = c->bm_bits;
2024 rl = tmp - c->bit_offset;
2025
2026 if (toggle == 2) { /* first iteration */
2027 if (rl == 0) {
2028 /* the first checked bit was set,
2029 * store start value, */
2030 DCBP_set_start(p, 1);
2031 /* but skip encoding of zero run length */
2032 toggle = !toggle;
2033 continue;
2034 }
2035 DCBP_set_start(p, 0);
2036 }
2037
2038 /* paranoia: catch zero runlength.
2039 * can only happen if bitmap is modified while we scan it. */
2040 if (rl == 0) {
2041 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2042 "t:%u bo:%lu\n", toggle, c->bit_offset);
2043 return -1;
2044 }
2045
2046 bits = vli_encode_bits(&bs, rl);
2047 if (bits == -ENOBUFS) /* buffer full */
2048 break;
2049 if (bits <= 0) {
2050 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2051 return 0;
2052 }
2053
2054 toggle = !toggle;
2055 plain_bits += rl;
2056 c->bit_offset = tmp;
2057 } while (c->bit_offset < c->bm_bits);
2058
2059 len = bs.cur.b - p->code + !!bs.cur.bit;
2060
2061 if (plain_bits < (len << 3)) {
2062 /* incompressible with this method.
2063 * we need to rewind both word and bit position. */
2064 c->bit_offset -= plain_bits;
2065 bm_xfer_ctx_bit_to_word_offset(c);
2066 c->bit_offset = c->word_offset * BITS_PER_LONG;
2067 return 0;
2068 }
2069
2070 /* RLE + VLI was able to compress it just fine.
2071 * update c->word_offset. */
2072 bm_xfer_ctx_bit_to_word_offset(c);
2073
2074 /* store pad_bits */
2075 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2076
2077 return len;
2078}
2079
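/* Send one bitmap packet: compressed (P_COMPRESSED_BITMAP) if RLE+VLI pays
 * off for this chunk, otherwise a plain P_BITMAP buffer of longs.
 * Returns OK while there is more to send, DONE when the transfer is
 * complete, FAILED on a send error. */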
2080enum { OK, FAILED, DONE }
2081send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002082 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002083{
2084 struct p_compressed_bm *p = (void*)h;
2085 unsigned long num_words;
2086 int len;
2087 int ok;
2088
2089 len = fill_bitmap_rle_bits(mdev, p, c);
2090
2091 if (len < 0)
2092 return FAILED;
2093
2094 if (len) {
2095 DCBP_set_code(p, RLE_VLI_Bits);
2096 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2097 sizeof(*p) + len, 0);
2098
2099 c->packets[0]++;
2100 c->bytes[0] += sizeof(*p) + len;
2101
2102 if (c->bit_offset >= c->bm_bits)
2103 len = 0; /* DONE */
2104 } else {
2105 /* was not compressible.
2106 * send a buffer full of plain text bits instead. */
2107 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2108 len = num_words * sizeof(long);
2109 if (len)
2110 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2111 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002112 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002113 c->word_offset += num_words;
2114 c->bit_offset = c->word_offset * BITS_PER_LONG;
2115
2116 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002117 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002118
2119 if (c->bit_offset > c->bm_bits)
2120 c->bit_offset = c->bm_bits;
2121 }
2122 ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2123
2124 if (ok == DONE)
2125 INFO_bm_xfer_stats(mdev, "send", c);
2126 return ok;
2127}
2128
2129/* See the comment at receive_bitmap() */
2130int _drbd_send_bitmap(struct drbd_conf *mdev)
2131{
2132 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002133 struct p_header80 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002134 int ret;
2135
2136 ERR_IF(!mdev->bitmap) return FALSE;
2137
2138 /* maybe we should use some per thread scratch page,
2139 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002140 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002141 if (!p) {
2142 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2143 return FALSE;
2144 }
2145
2146 if (get_ldev(mdev)) {
2147 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2148 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2149 drbd_bm_set_all(mdev);
2150 if (drbd_bm_write(mdev)) {
2151				/* write_bm did fail! Leave full sync flag set in meta data
2152 * but otherwise process as per normal - need to tell other
2153 * side that a full resync is required! */
2154 dev_err(DEV, "Failed to write bitmap to disk!\n");
2155 } else {
2156 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2157 drbd_md_sync(mdev);
2158 }
2159 }
2160 put_ldev(mdev);
2161 }
2162
2163 c = (struct bm_xfer_ctx) {
2164 .bm_bits = drbd_bm_bits(mdev),
2165 .bm_words = drbd_bm_words(mdev),
2166 };
2167
2168 do {
2169 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2170 } while (ret == OK);
2171
2172 free_page((unsigned long) p);
2173 return (ret == DONE);
2174}
2175
2176int drbd_send_bitmap(struct drbd_conf *mdev)
2177{
2178 int err;
2179
2180 if (!drbd_get_data_sock(mdev))
2181 return -1;
2182 err = !_drbd_send_bitmap(mdev);
2183 drbd_put_data_sock(mdev);
2184 return err;
2185}
2186
2187int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2188{
2189 int ok;
2190 struct p_barrier_ack p;
2191
2192 p.barrier = barrier_nr;
2193 p.set_size = cpu_to_be32(set_size);
2194
2195 if (mdev->state.conn < C_CONNECTED)
2196 return FALSE;
2197 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002198 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002199 return ok;
2200}
2201
2202/**
2203 * _drbd_send_ack() - Sends an ack packet
2204 * @mdev: DRBD device.
2205 * @cmd: Packet command code.
2206 * @sector: sector, needs to be in big endian byte order
2207 * @blksize: size in byte, needs to be in big endian byte order
2208 * @block_id: Id, big endian byte order
2209 */
2210static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2211 u64 sector,
2212 u32 blksize,
2213 u64 block_id)
2214{
2215 int ok;
2216 struct p_block_ack p;
2217
2218 p.sector = sector;
2219 p.block_id = block_id;
2220 p.blksize = blksize;
2221 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2222
2223 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2224 return FALSE;
2225 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002226 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002227 return ok;
2228}
2229
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002230/* dp->sector and dp->block_id already/still in network byte order,
2231 * data_size is payload size according to dp->head,
2232 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002233int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002234 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002235{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002236 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2237 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002238 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2239 dp->block_id);
2240}
2241
2242int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2243 struct p_block_req *rp)
2244{
2245 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2246}
2247
2248/**
2249 * drbd_send_ack() - Sends an ack packet
2250 * @mdev: DRBD device.
2251 * @cmd: Packet command code.
2252 * @e: Epoch entry.
2253 */
2254int drbd_send_ack(struct drbd_conf *mdev,
2255 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2256{
2257 return _drbd_send_ack(mdev, cmd,
2258 cpu_to_be64(e->sector),
2259 cpu_to_be32(e->size),
2260 e->block_id);
2261}
2262
2263/* This function misuses the block_id field to signal if the blocks
2264	 * are in sync or not. */
2265int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2266 sector_t sector, int blksize, u64 block_id)
2267{
2268 return _drbd_send_ack(mdev, cmd,
2269 cpu_to_be64(sector),
2270 cpu_to_be32(blksize),
2271 cpu_to_be64(block_id));
2272}
2273
2274int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2275 sector_t sector, int size, u64 block_id)
2276{
2277 int ok;
2278 struct p_block_req p;
2279
2280 p.sector = cpu_to_be64(sector);
2281 p.block_id = block_id;
2282 p.blksize = cpu_to_be32(size);
2283
2284 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002285 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002286 return ok;
2287}
2288
2289int drbd_send_drequest_csum(struct drbd_conf *mdev,
2290 sector_t sector, int size,
2291 void *digest, int digest_size,
2292 enum drbd_packets cmd)
2293{
2294 int ok;
2295 struct p_block_req p;
2296
2297 p.sector = cpu_to_be64(sector);
2298 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2299 p.blksize = cpu_to_be32(size);
2300
2301 p.head.magic = BE_DRBD_MAGIC;
2302 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002303 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002304
2305 mutex_lock(&mdev->data.mutex);
2306
2307 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2308 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2309
2310 mutex_unlock(&mdev->data.mutex);
2311
2312 return ok;
2313}
2314
2315int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2316{
2317 int ok;
2318 struct p_block_req p;
2319
2320 p.sector = cpu_to_be64(sector);
2321 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2322 p.blksize = cpu_to_be32(size);
2323
2324 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002325 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002326 return ok;
2327}
2328
2329/* called on sndtimeo
2330 * returns FALSE if we should retry,
2331 * TRUE if we think connection is dead
2332 */
2333static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2334{
2335 int drop_it;
2336 /* long elapsed = (long)(jiffies - mdev->last_received); */
2337
2338 drop_it = mdev->meta.socket == sock
2339 || !mdev->asender.task
2340 || get_t_state(&mdev->asender) != Running
2341 || mdev->state.conn < C_CONNECTED;
2342
2343 if (drop_it)
2344 return TRUE;
2345
2346 drop_it = !--mdev->ko_count;
2347 if (!drop_it) {
2348 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2349 current->comm, current->pid, mdev->ko_count);
2350 request_ping(mdev);
2351 }
2352
2353 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2354}
2355
2356/* The idea of sendpage seems to be to put some kind of reference
2357 * to the page into the skb, and to hand it over to the NIC. In
2358 * this process get_page() gets called.
2359 *
2360 * As soon as the page was really sent over the network put_page()
2361 * gets called by some part of the network layer. [ NIC driver? ]
2362 *
2363 * [ get_page() / put_page() increment/decrement the count. If count
2364 * reaches 0 the page will be freed. ]
2365 *
2366 * This works nicely with pages from FSs.
2367 * But this means that in protocol A we might signal IO completion too early!
2368 *
2369 * In order not to corrupt data during a resync we must make sure
2370	 * that we do not reuse our own buffer pages (EEs) too early, therefore
2371 * we have the net_ee list.
2372 *
2373 * XFS seems to have problems, still, it submits pages with page_count == 0!
2374 * As a workaround, we disable sendpage on pages
2375 * with page_count == 0 or PageSlab.
2376 */
2377static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002378 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002379{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002380 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002381 kunmap(page);
2382 if (sent == size)
2383 mdev->send_cnt += size>>9;
2384 return sent == size;
2385}
2386
2387static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002388 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002389{
2390 mm_segment_t oldfs = get_fs();
2391 int sent, ok;
2392 int len = size;
2393
2394 /* e.g. XFS meta- & log-data is in slab pages, which have a
2395 * page_count of 0 and/or have PageSlab() set.
2396 * we cannot use send_page for those, as that does get_page();
2397 * put_page(); and would cause either a VM_BUG directly, or
2398 * __page_cache_release a page that would actually still be referenced
2399 * by someone, leading to some obscure delayed Oops somewhere else. */
2400 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002401 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002402
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002403 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002404 drbd_update_congested(mdev);
2405 set_fs(KERNEL_DS);
2406 do {
2407 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2408 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002409 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002410 if (sent == -EAGAIN) {
2411 if (we_should_drop_the_connection(mdev,
2412 mdev->data.socket))
2413 break;
2414 else
2415 continue;
2416 }
2417 if (sent <= 0) {
2418 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2419 __func__, (int)size, len, sent);
2420 break;
2421 }
2422 len -= sent;
2423 offset += sent;
2424 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2425 set_fs(oldfs);
2426 clear_bit(NET_CONGESTED, &mdev->flags);
2427
2428 ok = (len == 0);
2429 if (likely(ok))
2430 mdev->send_cnt += size>>9;
2431 return ok;
2432}
2433
2434static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2435{
2436 struct bio_vec *bvec;
2437 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002438 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002439 __bio_for_each_segment(bvec, bio, i, 0) {
2440 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002441 bvec->bv_offset, bvec->bv_len,
2442 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002443 return 0;
2444 }
2445 return 1;
2446}
2447
2448static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2449{
2450 struct bio_vec *bvec;
2451 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002452 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002453 __bio_for_each_segment(bvec, bio, i, 0) {
2454 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002455 bvec->bv_offset, bvec->bv_len,
2456 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002457 return 0;
2458 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002459 return 1;
2460}
2461
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002462static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2463{
2464 struct page *page = e->pages;
2465 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002466 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002467 page_chain_for_each(page) {
2468 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002469 if (!_drbd_send_page(mdev, page, 0, l,
2470 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002471 return 0;
2472 len -= l;
2473 }
2474 return 1;
2475}
2476
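/* Translate the master bio's flags (REQ_*) into DRBD wire flags (DP_*).
 * Peers before protocol version 95 only understand the sync hint, so the
 * other flags are dropped for them. */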
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002477static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2478{
2479 if (mdev->agreed_pro_version >= 95)
2480 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002481 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2482 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2483 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2484 else
Jens Axboe721a9602011-03-09 11:56:30 +01002485 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002486}
2487
Philipp Reisnerb411b362009-09-25 16:07:19 -07002488/* Used to send write requests
2489 * R_PRIMARY -> Peer (P_DATA)
2490 */
2491int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2492{
2493 int ok = 1;
2494 struct p_data p;
2495 unsigned int dp_flags = 0;
2496 void *dgb;
2497 int dgs;
2498
2499 if (!drbd_get_data_sock(mdev))
2500 return 0;
2501
2502 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2503 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2504
Philipp Reisnerd5373382010-08-23 15:18:33 +02002505 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002506 p.head.h80.magic = BE_DRBD_MAGIC;
2507 p.head.h80.command = cpu_to_be16(P_DATA);
2508 p.head.h80.length =
2509 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2510 } else {
2511 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2512 p.head.h95.command = cpu_to_be16(P_DATA);
2513 p.head.h95.length =
2514 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2515 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002516
2517 p.sector = cpu_to_be64(req->sector);
2518 p.block_id = (unsigned long)req;
2519 p.seq_num = cpu_to_be32(req->seq_num =
2520 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002521
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002522 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2523
Philipp Reisnerb411b362009-09-25 16:07:19 -07002524 if (mdev->state.conn >= C_SYNC_SOURCE &&
2525 mdev->state.conn <= C_PAUSED_SYNC_T)
2526 dp_flags |= DP_MAY_SET_IN_SYNC;
2527
2528 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002529 set_bit(UNPLUG_REMOTE, &mdev->flags);
2530 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002531 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002532 if (ok && dgs) {
2533 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002534 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002535 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002536 }
2537 if (ok) {
2538 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2539 ok = _drbd_send_bio(mdev, req->master_bio);
2540 else
2541 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2542 }
2543
2544 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002545
Philipp Reisnerb411b362009-09-25 16:07:19 -07002546 return ok;
2547}
2548
2549/* answer packet, used to send data back for read requests:
2550 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2551 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2552 */
2553int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2554 struct drbd_epoch_entry *e)
2555{
2556 int ok;
2557 struct p_data p;
2558 void *dgb;
2559 int dgs;
2560
2561 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2562 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2563
Philipp Reisnerd5373382010-08-23 15:18:33 +02002564 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002565 p.head.h80.magic = BE_DRBD_MAGIC;
2566 p.head.h80.command = cpu_to_be16(cmd);
2567 p.head.h80.length =
2568 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2569 } else {
2570 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2571 p.head.h95.command = cpu_to_be16(cmd);
2572 p.head.h95.length =
2573 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2574 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002575
2576 p.sector = cpu_to_be64(e->sector);
2577 p.block_id = e->block_id;
2578 /* p.seq_num = 0; No sequence numbers here.. */
2579
2580 /* Only called by our kernel thread.
2581 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2582 * in response to admin command or module unload.
2583 */
2584 if (!drbd_get_data_sock(mdev))
2585 return 0;
2586
Philipp Reisner0b70a132010-08-20 13:36:10 +02002587 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002588 if (ok && dgs) {
2589 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002590 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002591 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002592 }
2593 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002594 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002595
2596 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002597
Philipp Reisnerb411b362009-09-25 16:07:19 -07002598 return ok;
2599}
2600
2601/*
2602 drbd_send distinguishes two cases:
2603
2604 Packets sent via the data socket "sock"
2605 and packets sent via the meta data socket "msock"
2606
2607 sock msock
2608 -----------------+-------------------------+------------------------------
2609 timeout conf.timeout / 2 conf.timeout / 2
2610 timeout action send a ping via msock Abort communication
2611 and close all sockets
2612*/
2613
2614/*
2615	 * you must already hold the appropriate [m]sock_mutex (taken elsewhere)!
2616 */
2617int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2618 void *buf, size_t size, unsigned msg_flags)
2619{
2620 struct kvec iov;
2621 struct msghdr msg;
2622 int rv, sent = 0;
2623
2624 if (!sock)
2625 return -1000;
2626
2627 /* THINK if (signal_pending) return ... ? */
2628
2629 iov.iov_base = buf;
2630 iov.iov_len = size;
2631
2632 msg.msg_name = NULL;
2633 msg.msg_namelen = 0;
2634 msg.msg_control = NULL;
2635 msg.msg_controllen = 0;
2636 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2637
2638 if (sock == mdev->data.socket) {
2639 mdev->ko_count = mdev->net_conf->ko_count;
2640 drbd_update_congested(mdev);
2641 }
2642 do {
2643 /* STRANGE
2644 * tcp_sendmsg does _not_ use its size parameter at all ?
2645 *
2646 * -EAGAIN on timeout, -EINTR on signal.
2647 */
2648/* THINK
2649 * do we need to block DRBD_SIG if sock == &meta.socket ??
2650 * otherwise wake_asender() might interrupt some send_*Ack !
2651 */
2652 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2653 if (rv == -EAGAIN) {
2654 if (we_should_drop_the_connection(mdev, sock))
2655 break;
2656 else
2657 continue;
2658 }
2659 D_ASSERT(rv != 0);
2660 if (rv == -EINTR) {
2661 flush_signals(current);
2662 rv = 0;
2663 }
2664 if (rv < 0)
2665 break;
2666 sent += rv;
2667 iov.iov_base += rv;
2668 iov.iov_len -= rv;
2669 } while (sent < size);
2670
2671 if (sock == mdev->data.socket)
2672 clear_bit(NET_CONGESTED, &mdev->flags);
2673
2674 if (rv <= 0) {
2675 if (rv != -EAGAIN) {
2676 dev_err(DEV, "%s_sendmsg returned %d\n",
2677 sock == mdev->meta.socket ? "msock" : "sock",
2678 rv);
2679 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2680 } else
2681 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2682 }
2683
2684 return sent;
2685}
2686
2687static int drbd_open(struct block_device *bdev, fmode_t mode)
2688{
2689 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2690 unsigned long flags;
2691 int rv = 0;
2692
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002693 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002694 spin_lock_irqsave(&mdev->req_lock, flags);
2695 /* to have a stable mdev->state.role
2696 * and no race with updating open_cnt */
2697
2698 if (mdev->state.role != R_PRIMARY) {
2699 if (mode & FMODE_WRITE)
2700 rv = -EROFS;
2701 else if (!allow_oos)
2702 rv = -EMEDIUMTYPE;
2703 }
2704
2705 if (!rv)
2706 mdev->open_cnt++;
2707 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002708 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002709
2710 return rv;
2711}
2712
2713static int drbd_release(struct gendisk *gd, fmode_t mode)
2714{
2715 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002716 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002717 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002718 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002719 return 0;
2720}
2721
Philipp Reisnerb411b362009-09-25 16:07:19 -07002722static void drbd_set_defaults(struct drbd_conf *mdev)
2723{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002724 /* This way we get a compile error when sync_conf grows,
2725	   and we forget to initialize it here */
2726 mdev->sync_conf = (struct syncer_conf) {
2727 /* .rate = */ DRBD_RATE_DEF,
2728 /* .after = */ DRBD_AFTER_DEF,
2729 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002730 /* .verify_alg = */ {}, 0,
2731 /* .cpu_mask = */ {}, 0,
2732 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002733 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002734 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2735 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2736 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2737 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002738 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2739 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002740 };
2741
2742	/* Have to do it this way, because the layout differs between
2743 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002744 mdev->state = (union drbd_state) {
2745 { .role = R_SECONDARY,
2746 .peer = R_UNKNOWN,
2747 .conn = C_STANDALONE,
2748 .disk = D_DISKLESS,
2749 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002750 .susp = 0,
2751 .susp_nod = 0,
2752 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002753 } };
2754}
2755
2756void drbd_init_set_defaults(struct drbd_conf *mdev)
2757{
2758 /* the memset(,0,) did most of this.
2759 * note: only assignments, no allocation in here */
2760
2761 drbd_set_defaults(mdev);
2762
Philipp Reisnerb411b362009-09-25 16:07:19 -07002763 atomic_set(&mdev->ap_bio_cnt, 0);
2764 atomic_set(&mdev->ap_pending_cnt, 0);
2765 atomic_set(&mdev->rs_pending_cnt, 0);
2766 atomic_set(&mdev->unacked_cnt, 0);
2767 atomic_set(&mdev->local_cnt, 0);
2768 atomic_set(&mdev->net_cnt, 0);
2769 atomic_set(&mdev->packet_seq, 0);
2770 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02002771 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02002772 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002773 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002774
2775 mutex_init(&mdev->md_io_mutex);
2776 mutex_init(&mdev->data.mutex);
2777 mutex_init(&mdev->meta.mutex);
2778 sema_init(&mdev->data.work.s, 0);
2779 sema_init(&mdev->meta.work.s, 0);
2780 mutex_init(&mdev->state_mutex);
2781
2782 spin_lock_init(&mdev->data.work.q_lock);
2783 spin_lock_init(&mdev->meta.work.q_lock);
2784
2785 spin_lock_init(&mdev->al_lock);
2786 spin_lock_init(&mdev->req_lock);
2787 spin_lock_init(&mdev->peer_seq_lock);
2788 spin_lock_init(&mdev->epoch_lock);
2789
2790 INIT_LIST_HEAD(&mdev->active_ee);
2791 INIT_LIST_HEAD(&mdev->sync_ee);
2792 INIT_LIST_HEAD(&mdev->done_ee);
2793 INIT_LIST_HEAD(&mdev->read_ee);
2794 INIT_LIST_HEAD(&mdev->net_ee);
2795 INIT_LIST_HEAD(&mdev->resync_reads);
2796 INIT_LIST_HEAD(&mdev->data.work.q);
2797 INIT_LIST_HEAD(&mdev->meta.work.q);
2798 INIT_LIST_HEAD(&mdev->resync_work.list);
2799 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002800 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002801 INIT_LIST_HEAD(&mdev->md_sync_work.list);
2802 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02002803
Philipp Reisnerb411b362009-09-25 16:07:19 -07002804 mdev->resync_work.cb = w_resync_inactive;
2805 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002806 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002807 mdev->md_sync_work.cb = w_md_sync;
2808 mdev->bm_io_work.w.cb = w_bitmap_io;
2809 init_timer(&mdev->resync_timer);
2810 init_timer(&mdev->md_sync_timer);
2811 mdev->resync_timer.function = resync_timer_fn;
2812 mdev->resync_timer.data = (unsigned long) mdev;
2813 mdev->md_sync_timer.function = md_sync_timer_fn;
2814 mdev->md_sync_timer.data = (unsigned long) mdev;
2815
2816 init_waitqueue_head(&mdev->misc_wait);
2817 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02002818 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002819 init_waitqueue_head(&mdev->ee_wait);
2820 init_waitqueue_head(&mdev->al_wait);
2821 init_waitqueue_head(&mdev->seq_wait);
2822
2823 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2824 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2825 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2826
2827 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02002828 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002829 mdev->resync_wenr = LC_FREE;
2830}
2831
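/* Reset a drbd_conf to the state right after drbd_init_set_defaults():
 * clear the transfer counters and resync marks and release the bitmap
 * and other resources. Expects the threads to be stopped already and all
 * epoch entry lists to be empty (asserted below). */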
2832void drbd_mdev_cleanup(struct drbd_conf *mdev)
2833{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002834 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002835 if (mdev->receiver.t_state != None)
2836 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2837 mdev->receiver.t_state);
2838
2839 /* no need to lock it, I'm the only thread alive */
2840 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
2841 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2842 mdev->al_writ_cnt =
2843 mdev->bm_writ_cnt =
2844 mdev->read_cnt =
2845 mdev->recv_cnt =
2846 mdev->send_cnt =
2847 mdev->writ_cnt =
2848 mdev->p_size =
2849 mdev->rs_start =
2850 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002851 mdev->rs_failed = 0;
2852 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002853 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02002854 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2855 mdev->rs_mark_left[i] = 0;
2856 mdev->rs_mark_time[i] = 0;
2857 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002858 D_ASSERT(mdev->net_conf == NULL);
2859
2860 drbd_set_my_capacity(mdev, 0);
2861 if (mdev->bitmap) {
2862 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01002863 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002864 drbd_bm_cleanup(mdev);
2865 }
2866
2867 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02002868 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002869
2870 /*
2871	 * currently we call drbd_init_ee only on module load, so
2872	 * we may call drbd_release_ee only on module unload!
2873 */
2874 D_ASSERT(list_empty(&mdev->active_ee));
2875 D_ASSERT(list_empty(&mdev->sync_ee));
2876 D_ASSERT(list_empty(&mdev->done_ee));
2877 D_ASSERT(list_empty(&mdev->read_ee));
2878 D_ASSERT(list_empty(&mdev->net_ee));
2879 D_ASSERT(list_empty(&mdev->resync_reads));
2880 D_ASSERT(list_empty(&mdev->data.work.q));
2881 D_ASSERT(list_empty(&mdev->meta.work.q));
2882 D_ASSERT(list_empty(&mdev->resync_work.list));
2883 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002884 D_ASSERT(list_empty(&mdev->go_diskless.list));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002885}
2886
2887
2888static void drbd_destroy_mempools(void)
2889{
2890 struct page *page;
2891
2892 while (drbd_pp_pool) {
2893 page = drbd_pp_pool;
2894 drbd_pp_pool = (struct page *)page_private(page);
2895 __free_page(page);
2896 drbd_pp_vacant--;
2897 }
2898
2899 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2900
2901 if (drbd_ee_mempool)
2902 mempool_destroy(drbd_ee_mempool);
2903 if (drbd_request_mempool)
2904 mempool_destroy(drbd_request_mempool);
2905 if (drbd_ee_cache)
2906 kmem_cache_destroy(drbd_ee_cache);
2907 if (drbd_request_cache)
2908 kmem_cache_destroy(drbd_request_cache);
2909 if (drbd_bm_ext_cache)
2910 kmem_cache_destroy(drbd_bm_ext_cache);
2911 if (drbd_al_ext_cache)
2912 kmem_cache_destroy(drbd_al_ext_cache);
2913
2914 drbd_ee_mempool = NULL;
2915 drbd_request_mempool = NULL;
2916 drbd_ee_cache = NULL;
2917 drbd_request_cache = NULL;
2918 drbd_bm_ext_cache = NULL;
2919 drbd_al_ext_cache = NULL;
2920
2921 return;
2922}
2923
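/* Allocate the slab caches, the mempools and the pre-allocated page pool,
 * all sized for minor_count devices. Returns 0 or -ENOMEM; on failure
 * everything allocated so far is torn down again. */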
2924static int drbd_create_mempools(void)
2925{
2926 struct page *page;
2927 const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2928 int i;
2929
2930 /* prepare our caches and mempools */
2931 drbd_request_mempool = NULL;
2932 drbd_ee_cache = NULL;
2933 drbd_request_cache = NULL;
2934 drbd_bm_ext_cache = NULL;
2935 drbd_al_ext_cache = NULL;
2936 drbd_pp_pool = NULL;
2937
2938 /* caches */
2939 drbd_request_cache = kmem_cache_create(
2940 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2941 if (drbd_request_cache == NULL)
2942 goto Enomem;
2943
2944 drbd_ee_cache = kmem_cache_create(
2945 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2946 if (drbd_ee_cache == NULL)
2947 goto Enomem;
2948
2949 drbd_bm_ext_cache = kmem_cache_create(
2950 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2951 if (drbd_bm_ext_cache == NULL)
2952 goto Enomem;
2953
2954 drbd_al_ext_cache = kmem_cache_create(
2955 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2956 if (drbd_al_ext_cache == NULL)
2957 goto Enomem;
2958
2959 /* mempools */
2960 drbd_request_mempool = mempool_create(number,
2961 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2962 if (drbd_request_mempool == NULL)
2963 goto Enomem;
2964
2965 drbd_ee_mempool = mempool_create(number,
2966 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06002967 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002968 goto Enomem;
2969
2970 /* drbd's page pool */
2971 spin_lock_init(&drbd_pp_lock);
2972
2973 for (i = 0; i < number; i++) {
2974 page = alloc_page(GFP_HIGHUSER);
2975 if (!page)
2976 goto Enomem;
2977 set_page_private(page, (unsigned long)drbd_pp_pool);
2978 drbd_pp_pool = page;
2979 }
2980 drbd_pp_vacant = number;
2981
2982 return 0;
2983
2984Enomem:
2985 drbd_destroy_mempools(); /* in case we allocated some */
2986 return -ENOMEM;
2987}
2988
2989static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2990 void *unused)
2991{
2992 /* just so we have it. you never know what interesting things we
2993 * might want to do here some day...
2994 */
2995
2996 return NOTIFY_DONE;
2997}
2998
2999static struct notifier_block drbd_notifier = {
3000 .notifier_call = drbd_notify_sys,
3001};
3002
3003static void drbd_release_ee_lists(struct drbd_conf *mdev)
3004{
3005 int rr;
3006
3007 rr = drbd_release_ee(mdev, &mdev->active_ee);
3008 if (rr)
3009 dev_err(DEV, "%d EEs in active list found!\n", rr);
3010
3011 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3012 if (rr)
3013 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3014
3015 rr = drbd_release_ee(mdev, &mdev->read_ee);
3016 if (rr)
3017 dev_err(DEV, "%d EEs in read list found!\n", rr);
3018
3019 rr = drbd_release_ee(mdev, &mdev->done_ee);
3020 if (rr)
3021 dev_err(DEV, "%d EEs in done list found!\n", rr);
3022
3023 rr = drbd_release_ee(mdev, &mdev->net_ee);
3024 if (rr)
3025 dev_err(DEV, "%d EEs in net list found!\n", rr);
3026}
3027
3028/* caution. no locking.
3029 * currently only used from module cleanup code. */
3030static void drbd_delete_device(unsigned int minor)
3031{
3032 struct drbd_conf *mdev = minor_to_mdev(minor);
3033
3034 if (!mdev)
3035 return;
3036
3037 /* paranoia asserts */
3038 if (mdev->open_cnt != 0)
3039 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3040 __FILE__ , __LINE__);
3041
3042 ERR_IF (!list_empty(&mdev->data.work.q)) {
3043 struct list_head *lp;
3044 list_for_each(lp, &mdev->data.work.q) {
3045 dev_err(DEV, "lp = %p\n", lp);
3046 }
3047 };
3048 /* end paranoia asserts */
3049
3050 del_gendisk(mdev->vdisk);
3051
3052 /* cleanup stuff that may have been allocated during
3053 * device (re-)configuration or state changes */
3054
3055 if (mdev->this_bdev)
3056 bdput(mdev->this_bdev);
3057
3058 drbd_free_resources(mdev);
3059
3060 drbd_release_ee_lists(mdev);
3061
3062 /* should be free'd on disconnect? */
3063 kfree(mdev->ee_hash);
3064 /*
3065 mdev->ee_hash_s = 0;
3066 mdev->ee_hash = NULL;
3067 */
3068
3069 lc_destroy(mdev->act_log);
3070 lc_destroy(mdev->resync);
3071
3072 kfree(mdev->p_uuid);
3073 /* mdev->p_uuid = NULL; */
3074
3075 kfree(mdev->int_dig_out);
3076 kfree(mdev->int_dig_in);
3077 kfree(mdev->int_dig_vv);
3078
3079 /* cleanup the rest that has been
3080 * allocated from drbd_new_device
3081 * and actually free the mdev itself */
3082 drbd_free_mdev(mdev);
3083}
3084
3085static void drbd_cleanup(void)
3086{
3087 unsigned int i;
3088
3089 unregister_reboot_notifier(&drbd_notifier);
3090
3091 drbd_nl_cleanup();
3092
3093 if (minor_table) {
3094 if (drbd_proc)
3095 remove_proc_entry("drbd", NULL);
3096 i = minor_count;
3097 while (i--)
3098 drbd_delete_device(i);
3099 drbd_destroy_mempools();
3100 }
3101
3102 kfree(minor_table);
3103
3104 unregister_blkdev(DRBD_MAJOR, "drbd");
3105
3106 printk(KERN_INFO "drbd: module cleanup done.\n");
3107}
3108
3109/**
3110 * drbd_congested() - Callback for pdflush
3111 * @congested_data: User data
3112 * @bdi_bits: Bits pdflush is currently interested in
3113 *
3114 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3115 */
3116static int drbd_congested(void *congested_data, int bdi_bits)
3117{
3118 struct drbd_conf *mdev = congested_data;
3119 struct request_queue *q;
3120 char reason = '-';
3121 int r = 0;
3122
3123 if (!__inc_ap_bio_cond(mdev)) {
3124 /* DRBD has frozen IO */
3125 r = bdi_bits;
3126 reason = 'd';
3127 goto out;
3128 }
3129
3130 if (get_ldev(mdev)) {
3131 q = bdev_get_queue(mdev->ldev->backing_bdev);
3132 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3133 put_ldev(mdev);
3134 if (r)
3135 reason = 'b';
3136 }
3137
3138 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3139 r |= (1 << BDI_async_congested);
3140 reason = reason == 'b' ? 'a' : 'n';
3141 }
3142
3143out:
3144 mdev->congestion_reason = reason;
3145 return r;
3146}
3147
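/* Allocate and set up one drbd_conf for the given minor: request queue,
 * gendisk, meta data I/O page, bitmap, transfer log, application read
 * hash and initial epoch. Returns NULL on allocation failure;
 * drbd_free_mdev() is the counterpart. */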
3148struct drbd_conf *drbd_new_device(unsigned int minor)
3149{
3150 struct drbd_conf *mdev;
3151 struct gendisk *disk;
3152 struct request_queue *q;
3153
3154 /* GFP_KERNEL, we are outside of all write-out paths */
3155 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3156 if (!mdev)
3157 return NULL;
3158 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3159 goto out_no_cpumask;
3160
3161 mdev->minor = minor;
3162
3163 drbd_init_set_defaults(mdev);
3164
3165 q = blk_alloc_queue(GFP_KERNEL);
3166 if (!q)
3167 goto out_no_q;
3168 mdev->rq_queue = q;
3169 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003170
3171 disk = alloc_disk(1);
3172 if (!disk)
3173 goto out_no_disk;
3174 mdev->vdisk = disk;
3175
3176 set_disk_ro(disk, TRUE);
3177
3178 disk->queue = q;
3179 disk->major = DRBD_MAJOR;
3180 disk->first_minor = minor;
3181 disk->fops = &drbd_ops;
3182 sprintf(disk->disk_name, "drbd%d", minor);
3183 disk->private_data = mdev;
3184
3185 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3186 /* we have no partitions. we contain only ourselves. */
3187 mdev->this_bdev->bd_contains = mdev->this_bdev;
3188
3189 q->backing_dev_info.congested_fn = drbd_congested;
3190 q->backing_dev_info.congested_data = mdev;
3191
3192 blk_queue_make_request(q, drbd_make_request_26);
Lars Ellenberg98ec2862010-01-21 19:33:14 +01003193 blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003194 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3195 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003196 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003197
3198 mdev->md_io_page = alloc_page(GFP_KERNEL);
3199 if (!mdev->md_io_page)
3200 goto out_no_io_page;
3201
3202 if (drbd_bm_init(mdev))
3203 goto out_no_bitmap;
3204 /* no need to lock access, we are still initializing this minor device. */
3205 if (!tl_init(mdev))
3206 goto out_no_tl;
3207
3208 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3209 if (!mdev->app_reads_hash)
3210 goto out_no_app_reads;
3211
3212 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3213 if (!mdev->current_epoch)
3214 goto out_no_epoch;
3215
3216 INIT_LIST_HEAD(&mdev->current_epoch->list);
3217 mdev->epochs = 1;
3218
3219 return mdev;
3220
3221/* out_whatever_else:
3222 kfree(mdev->current_epoch); */
3223out_no_epoch:
3224 kfree(mdev->app_reads_hash);
3225out_no_app_reads:
3226 tl_cleanup(mdev);
3227out_no_tl:
3228 drbd_bm_cleanup(mdev);
3229out_no_bitmap:
3230 __free_page(mdev->md_io_page);
3231out_no_io_page:
3232 put_disk(disk);
3233out_no_disk:
3234 blk_cleanup_queue(q);
3235out_no_q:
3236 free_cpumask_var(mdev->cpu_mask);
3237out_no_cpumask:
3238 kfree(mdev);
3239 return NULL;
3240}
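
/*
 * Usage sketch (hypothetical caller; the real call sites live elsewhere in
 * the driver): drbd_new_device() and drbd_free_mdev() are strict
 * counterparts, with drbd_delete_device() doing the full teardown:
 *
 *	struct drbd_conf *mdev = drbd_new_device(minor);
 *
 *	if (!mdev)
 *		return ERR_NOMEM;	(error handling here is assumed)
 *	minor_table[minor] = mdev;
 *	...
 *	later: drbd_delete_device(minor), which ends in drbd_free_mdev(mdev)
 */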

/* counterpart of drbd_new_device.
 * last part of drbd_delete_device. */
void drbd_free_mdev(struct drbd_conf *mdev)
{
	kfree(mdev->current_epoch);
	kfree(mdev->app_reads_hash);
	tl_cleanup(mdev);
	if (mdev->bitmap) /* should no longer be there. */
		drbd_bm_cleanup(mdev);
	__free_page(mdev->md_io_page);
	put_disk(mdev->vdisk);
	blk_cleanup_queue(mdev->rq_queue);
	free_cpumask_var(mdev->cpu_mask);
	kfree(mdev);
}


int __init drbd_init(void)
{
	int err;

	if (sizeof(struct p_handshake) != 80) {
		printk(KERN_ERR
		       "drbd: never change the size or layout "
		       "of the HandShake packet.\n");
		return -EINVAL;
	}

	if (1 > minor_count || minor_count > 255) {
		printk(KERN_ERR
		       "drbd: invalid minor_count (%d)\n", minor_count);
#ifdef MODULE
		return -EINVAL;
#else
		minor_count = 8;
#endif
	}

	err = drbd_nl_init();
	if (err)
		return err;

	err = register_blkdev(DRBD_MAJOR, "drbd");
	if (err) {
		printk(KERN_ERR
		       "drbd: unable to register block device major %d\n",
		       DRBD_MAJOR);
		return err;
	}

	register_reboot_notifier(&drbd_notifier);

	/*
	 * allocate all necessary structs
	 */
	err = -ENOMEM;

	init_waitqueue_head(&drbd_pp_wait);

	drbd_proc = NULL; /* play safe for drbd_cleanup */
	minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
			      GFP_KERNEL);
	if (!minor_table)
		goto Enomem;

	err = drbd_create_mempools();
	if (err)
		goto Enomem;

	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO, NULL, &drbd_proc_fops, NULL);
	if (!drbd_proc) {
		printk(KERN_ERR "drbd: unable to register proc file\n");
		/* err is 0 here if the mempools were created; make sure we
		 * actually report the failure to the caller */
		err = -ENOMEM;
		goto Enomem;
	}

	rwlock_init(&global_state_lock);

	printk(KERN_INFO "drbd: initialized. "
	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
	printk(KERN_INFO "drbd: registered as block device major %d\n",
	       DRBD_MAJOR);
	printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);

	return 0; /* Success! */

Enomem:
	drbd_cleanup();
	if (err == -ENOMEM)
		/* currently always the case */
		printk(KERN_ERR "drbd: ran out of memory\n");
	else
		printk(KERN_ERR "drbd: initialization failure\n");
	return err;
}

void drbd_free_bc(struct drbd_backing_dev *ldev)
{
	if (ldev == NULL)
		return;

	blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);

	kfree(ldev);
}

void drbd_free_sock(struct drbd_conf *mdev)
{
	if (mdev->data.socket) {
		mutex_lock(&mdev->data.mutex);
		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
		sock_release(mdev->data.socket);
		mdev->data.socket = NULL;
		mutex_unlock(&mdev->data.mutex);
	}
	if (mdev->meta.socket) {
		mutex_lock(&mdev->meta.mutex);
		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
		sock_release(mdev->meta.socket);
		mdev->meta.socket = NULL;
		mutex_unlock(&mdev->meta.mutex);
	}
}


void drbd_free_resources(struct drbd_conf *mdev)
{
	crypto_free_hash(mdev->csums_tfm);
	mdev->csums_tfm = NULL;
	crypto_free_hash(mdev->verify_tfm);
	mdev->verify_tfm = NULL;
	crypto_free_hash(mdev->cram_hmac_tfm);
	mdev->cram_hmac_tfm = NULL;
	crypto_free_hash(mdev->integrity_w_tfm);
	mdev->integrity_w_tfm = NULL;
	crypto_free_hash(mdev->integrity_r_tfm);
	mdev->integrity_r_tfm = NULL;

	drbd_free_sock(mdev);

	__no_warn(local,
		  drbd_free_bc(mdev->ldev);
		  mdev->ldev = NULL;);
}

/* meta data management */

struct meta_data_on_disk {
	u64 la_size;           /* last agreed size. */
	u64 uuid[UI_SIZE];     /* UUIDs. */
	u64 device_uuid;
	u64 reserved_u64_1;
	u32 flags;             /* MDF */
	u32 magic;
	u32 md_size_sect;
	u32 al_offset;         /* offset to this block */
	u32 al_nr_extents;     /* important for restoring the AL */
	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
	u32 bm_offset;         /* offset to the bitmap, from here */
	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
	u32 reserved_u32[4];

} __packed;
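
/*
 * On-disk note: every multi-byte field above is stored big-endian.
 * drbd_md_sync() below fills the block with cpu_to_be32()/cpu_to_be64() and
 * drbd_md_read() converts back with be32_to_cpu()/be64_to_cpu(), e.g.
 *
 *	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
 *	...
 *	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
 *
 * The struct is smaller than the 512 bytes cleared in drbd_md_sync(), so
 * the space after it goes to disk as zeroes.
 */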

/**
 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
 * @mdev: DRBD device.
 */
void drbd_md_sync(struct drbd_conf *mdev)
{
	struct meta_data_on_disk *buffer;
	sector_t sector;
	int i;

	del_timer(&mdev->md_sync_timer);
	/* timer may be rearmed by drbd_md_mark_dirty() now. */
	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
		return;

	/* We use here D_FAILED and not D_ATTACHING because we try to write
	 * metadata even if we detach due to a disk failure! */
	if (!get_ldev_if_state(mdev, D_FAILED))
		return;

	mutex_lock(&mdev->md_io_mutex);
	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
	memset(buffer, 0, 512);

	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
	for (i = UI_CURRENT; i < UI_SIZE; i++)
		buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);

	buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
	buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
	buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
	buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);

	buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);

	D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
	sector = mdev->ldev->md.md_offset;

	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
		/* this was a try anyways ... */
		dev_err(DEV, "meta data update failed!\n");
		drbd_chk_io_error(mdev, 1, TRUE);
	}

	/* Update mdev->ldev->md.la_size_sect,
	 * since we updated it on metadata. */
	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);

	mutex_unlock(&mdev->md_io_mutex);
	put_ldev(mdev);
}
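
/*
 * Implementation note (descriptive only): meta data IO goes through the one
 * page preallocated in drbd_new_device() (mdev->md_io_page), serialized by
 * md_io_mutex, so nothing has to be allocated on the write-out path.
 * drbd_md_read() below follows the same pattern:
 *
 *	mutex_lock(&mdev->md_io_mutex);
 *	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
 *	... fill or parse *buffer ...
 *	mutex_unlock(&mdev->md_io_mutex);
 */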

/**
 * drbd_md_read() - Reads in the meta data super block
 * @mdev: DRBD device.
 * @bdev: Device from which the meta data should be read in.
 *
 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
 */
int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
	struct meta_data_on_disk *buffer;
	int i, rv = NO_ERROR;

	if (!get_ldev_if_state(mdev, D_ATTACHING))
		return ERR_IO_MD_DISK;

	mutex_lock(&mdev->md_io_mutex);
	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);

	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
		/* NOTE: can't do normal error processing here as this is
		 * called BEFORE disk is attached */
		dev_err(DEV, "Error while reading metadata.\n");
		rv = ERR_IO_MD_DISK;
		goto err;
	}

	if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
		dev_err(DEV, "Error while reading metadata, magic not found.\n");
		rv = ERR_MD_INVALID;
		goto err;
	}
	if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
		dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
			be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
		rv = ERR_MD_INVALID;
		goto err;
	}
	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
		dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
			be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
		rv = ERR_MD_INVALID;
		goto err;
	}
	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
		dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
			be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
		rv = ERR_MD_INVALID;
		goto err;
	}

	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
		dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
			be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
		rv = ERR_MD_INVALID;
		goto err;
	}

	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
	for (i = UI_CURRENT; i < UI_SIZE; i++)
		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
	bdev->md.flags = be32_to_cpu(buffer->flags);
	mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);

	/* al_extents below the allowed minimum (7) fall back to 127 */
	if (mdev->sync_conf.al_extents < 7)
		mdev->sync_conf.al_extents = 127;

 err:
	mutex_unlock(&mdev->md_io_mutex);
	put_ldev(mdev);

	return rv;
}

static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
{
	static char *uuid_str[UI_EXTENDED_SIZE] = {
		[UI_CURRENT] = "CURRENT",
		[UI_BITMAP] = "BITMAP",
		[UI_HISTORY_START] = "HISTORY_START",
		[UI_HISTORY_END] = "HISTORY_END",
		[UI_SIZE] = "SIZE",
		[UI_FLAGS] = "FLAGS",
	};

	if (index >= UI_EXTENDED_SIZE) {
		dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
		return;
	}

	dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
			uuid_str[index],
			(unsigned long long)mdev->ldev->md.uuid[index]);
}


/**
 * drbd_md_mark_dirty() - Mark meta data super block as dirty
 * @mdev: DRBD device.
 *
 * Call this function if you change anything that should be written to
 * the meta-data super block. This function sets MD_DIRTY, and starts a
 * timer that ensures that drbd_md_sync() gets called within five seconds.
 */
#ifdef DEBUG
void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
{
	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
		mdev->last_md_mark_dirty.line = line;
		mdev->last_md_mark_dirty.func = func;
	}
}
#else
void drbd_md_mark_dirty(struct drbd_conf *mdev)
{
	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
		mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
}
#endif
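
/*
 * Usage sketch (assumed typical pattern, mirroring drbd_uuid_new_current()
 * and the flag helpers further down): callers either rely on the timer
 * armed here, or force the write out themselves:
 *
 *	_drbd_uuid_set(mdev, UI_CURRENT, val);	marks the super block dirty
 *	...					md_sync_timer syncs it later
 *
 * or, when the change must reach stable storage immediately:
 *
 *	drbd_md_mark_dirty(mdev);
 *	drbd_md_sync(mdev);
 */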

static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
{
	int i;

	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
		debug_drbd_uuid(mdev, i+1);
	}
}

void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
{
	if (idx == UI_CURRENT) {
		if (mdev->state.role == R_PRIMARY)
			val |= 1;
		else
			val &= ~((u64)1);

		drbd_set_ed_uuid(mdev, val);
	}

	mdev->ldev->md.uuid[idx] = val;
	debug_drbd_uuid(mdev, idx);
	drbd_md_mark_dirty(mdev);
}


void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
{
	if (mdev->ldev->md.uuid[idx]) {
		drbd_uuid_move_history(mdev);
		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
		debug_drbd_uuid(mdev, UI_HISTORY_START);
	}
	_drbd_uuid_set(mdev, idx, val);
}

/**
 * drbd_uuid_new_current() - Creates a new current UUID
 * @mdev: DRBD device.
 *
 * Creates a new current UUID, and rotates the old current UUID into
 * the bitmap slot. Causes an incremental resync upon next connect.
 */
void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
{
	u64 val;

	dev_info(DEV, "Creating new current UUID\n");
	D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
	debug_drbd_uuid(mdev, UI_BITMAP);

	get_random_bytes(&val, sizeof(u64));
	_drbd_uuid_set(mdev, UI_CURRENT, val);
	/* get it to stable storage _now_ */
	drbd_md_sync(mdev);
}

void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
{
	if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
		return;

	if (val == 0) {
		drbd_uuid_move_history(mdev);
		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
		mdev->ldev->md.uuid[UI_BITMAP] = 0;
		debug_drbd_uuid(mdev, UI_HISTORY_START);
		debug_drbd_uuid(mdev, UI_BITMAP);
	} else {
		if (mdev->ldev->md.uuid[UI_BITMAP])
			dev_warn(DEV, "bm UUID already set");

		mdev->ldev->md.uuid[UI_BITMAP] = val;
		mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);

		debug_drbd_uuid(mdev, UI_BITMAP);
	}
	drbd_md_mark_dirty(mdev);
}

/**
 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
 * @mdev: DRBD device.
 *
 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
 */
int drbd_bmio_set_n_write(struct drbd_conf *mdev)
{
	int rv = -EIO;

	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		drbd_md_set_flag(mdev, MDF_FULL_SYNC);
		drbd_md_sync(mdev);
		drbd_bm_set_all(mdev);

		rv = drbd_bm_write(mdev);

		if (!rv) {
			drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
			drbd_md_sync(mdev);
		}

		put_ldev(mdev);
	}

	return rv;
}

/**
 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
 * @mdev: DRBD device.
 *
 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
 */
int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
{
	int rv = -EIO;

	drbd_resume_al(mdev);
	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		drbd_bm_clear_all(mdev);
		rv = drbd_bm_write(mdev);
		put_ldev(mdev);
	}

	return rv;
}

static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
	int rv;

	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);

	drbd_bm_lock(mdev, work->why);
	rv = work->io_fn(mdev);
	drbd_bm_unlock(mdev);

	clear_bit(BITMAP_IO, &mdev->flags);
	wake_up(&mdev->misc_wait);

	if (work->done)
		work->done(mdev, rv);

	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
	work->why = NULL;

	return 1;
}

void drbd_ldev_destroy(struct drbd_conf *mdev)
{
	lc_destroy(mdev->resync);
	mdev->resync = NULL;
	lc_destroy(mdev->act_log);
	mdev->act_log = NULL;
	__no_warn(local,
		  drbd_free_bc(mdev->ldev);
		  mdev->ldev = NULL;);

	if (mdev->md_io_tmpp) {
		__free_page(mdev->md_io_tmpp);
		mdev->md_io_tmpp = NULL;
	}
	clear_bit(GO_DISKLESS, &mdev->flags);
}

static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	D_ASSERT(mdev->state.disk == D_FAILED);
	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
	 * the protected members anymore, though, so once put_ldev reaches zero
	 * again, it will be safe to free them. */
	drbd_force_state(mdev, NS(disk, D_DISKLESS));
	return 1;
}

void drbd_go_diskless(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->state.disk == D_FAILED);
	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
		drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
}

/**
 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
 * @mdev: DRBD device.
 * @io_fn: IO callback to be called when bitmap IO is possible
 * @done: callback to be called after the bitmap IO was performed
 * @why: Descriptive text of the reason for doing the IO
 *
 * While IO on the bitmap happens we freeze application IO, thus ensuring
 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
 * called from worker context. It MUST NOT be used while a previous such
 * work is still pending!
 */
void drbd_queue_bitmap_io(struct drbd_conf *mdev,
			  int (*io_fn)(struct drbd_conf *),
			  void (*done)(struct drbd_conf *, int),
			  char *why)
{
	D_ASSERT(current == mdev->worker.task);

	D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
	if (mdev->bm_io_work.why)
		dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
			why, mdev->bm_io_work.why);

	mdev->bm_io_work.io_fn = io_fn;
	mdev->bm_io_work.done = done;
	mdev->bm_io_work.why = why;

	set_bit(BITMAP_IO, &mdev->flags);
	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
		if (list_empty(&mdev->bm_io_work.w.list)) {
			set_bit(BITMAP_IO_QUEUED, &mdev->flags);
			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
		} else
			dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
	}
}
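
/*
 * Usage sketch (hypothetical caller and done() callback; the real call
 * sites live elsewhere in the driver): from worker context, queue a full
 * "set all bits and write out" pass and get notified on completion:
 *
 *	static void full_sync_done(struct drbd_conf *mdev, int rv)
 *	{
 *		if (rv)
 *			dev_err(DEV, "writing the 'all dirty' bitmap failed\n");
 *	}
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     &full_sync_done, "set_n_write sketch");
 */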

/**
 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
 * @mdev: DRBD device.
 * @io_fn: IO callback to be called when bitmap IO is possible
 * @why: Descriptive text of the reason for doing the IO
 *
 * Freezes application IO while the actual IO operation runs. This function
 * MAY NOT be called from worker context.
 */
int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
{
	int rv;

	D_ASSERT(current != mdev->worker.task);

	drbd_suspend_io(mdev);

	drbd_bm_lock(mdev, why);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}
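
/*
 * Usage sketch (hypothetical, from non-worker context): run a whole-bitmap
 * operation synchronously while application IO is suspended:
 *
 *	int rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *				"clear_n_write sketch");
 *	if (rv)
 *		dev_err(DEV, "clearing the bitmap failed\n");
 */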

void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != flag) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags |= flag;
	}
}

void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != 0) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags &= ~flag;
	}
}

int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
{
	return (bdev->md.flags & flag) != 0;
}
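
/*
 * Example use of the flag helpers (mirrors drbd_bmio_set_n_write() above;
 * the interpretation of the flag is an editorial assumption): MDF_FULL_SYNC
 * is set and synced before dirtying the whole bitmap, and cleared again only
 * after the bitmap reached stable storage, so
 *
 *	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC))
 *		... the "set all bits and write" pass did not complete ...
 */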

static void md_sync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
}

static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
		 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
	drbd_md_sync(mdev);
	return 1;
}

#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;
	unsigned long count;
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701  /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator. Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	long refresh;

	if (!rsp->count--) {
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}
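
/*
 * Descriptive note: this is a plain linear congruential generator,
 *
 *	state = state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
 *
 * reseeded from get_random_bytes() every FAULT_RANDOM_REFRESH (10000) draws.
 * swahw32() swaps the 16 bit halfwords of the result, presumably so the
 * statistically weaker low-order LCG bits do not dominate the "% 100" test
 * in _drbd_insert_fault() below.
 */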

static char *
_drbd_fault_str(unsigned int type) {
	static char *_faults[] = {
		[DRBD_FAULT_MD_WR] = "Meta-data write",
		[DRBD_FAULT_MD_RD] = "Meta-data read",
		[DRBD_FAULT_RS_WR] = "Resync write",
		[DRBD_FAULT_RS_RD] = "Resync read",
		[DRBD_FAULT_DT_WR] = "Data write",
		[DRBD_FAULT_DT_RD] = "Data read",
		[DRBD_FAULT_DT_RA] = "Data read ahead",
		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
		[DRBD_FAULT_AL_EE] = "EE allocation",
		[DRBD_FAULT_RECEIVE] = "receive data corruption",
	};

	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
}

unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
{
	static struct fault_random_state rrs = {0, 0};

	unsigned int ret = (
		(fault_devs == 0 ||
			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));

	if (ret) {
		fault_count++;

		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "***Simulating %s failure\n",
				 _drbd_fault_str(type));
	}

	return ret;
}
#endif
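
/*
 * Note on usage (hedged; the exact wrapper macro lives in drbd_int.h): the
 * IO paths do not call _drbd_insert_fault() directly but go through a macro
 * that compiles away when CONFIG_DRBD_FAULT_INJECTION is not set,
 * conceptually something like
 *
 *	if (drbd_insert_fault(mdev, DRBD_FAULT_DT_WR))
 *		bio_endio(bio, -EIO);
 *	else
 *		generic_make_request(bio);
 *
 * and the behaviour is tuned at runtime via the fault_rate, fault_devs and
 * fault_count variables referenced above (exposed as module parameters).
 */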

const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}

module_init(drbd_init)
module_exit(drbd_cleanup)

EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);