/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void _tl_clear(struct drbd_conf *mdev);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;	/* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;
mempool_t *drbd_md_io_page_pool;
struct bio_set *drbd_md_io_bio_set;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

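/* Allocate a bio for meta-data IO.  Prefer the dedicated drbd_md_io_bio_set;
 * if that bio_set is not (yet or any more) available, fall back to the
 * global bio pool. */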
struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
	if (!drbd_md_io_bio_set)
		return bio_alloc(gfp_mask, 1);

	return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
}

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular double linked list of requests
 * attached.
 */
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
	INIT_LIST_HEAD(&mdev->barrier_acked_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	new->br_number = newest_before->br_number+1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the lists head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	 */
	list_splice_init(&b->requests, &mdev->barrier_acked_requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}


/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}

	/* Actions operating on the disk state, also want to work on
	   requests that got barrier acked. */

	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
		req = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(req, what);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_clear(mdev);
	spin_unlock_irq(&mdev->req_lock);
}

static void _tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
 * @mdev:	DRBD device.
 */
void tl_abort_disk_io(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;
	struct list_head *le, *tle;
	struct drbd_request *req;

	spin_lock_irq(&mdev->req_lock);
	b = mdev->oldest_tle;
	while (b) {
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			if (!(req->rq_state & RQ_LOCAL_PENDING))
				continue;
			_req_mod(req, abort_disk_io);
		}
		b = b->next;
	}

	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
		req = list_entry(le, struct drbd_request, tl_requests);
		if (!(req->rq_state & RQ_LOCAL_PENDING))
			continue;
		_req_mod(req, abort_disk_io);
	}

	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

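/* Apply @mask/@val to the current state directly under req_lock via
 * _drbd_set_state(); no cluster-wide negotiation with the peer, and no
 * waiting for the after-state-change work (done == NULL). */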
enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
enum sanitize_state_warnings {
	NO_WARNING,
	ABORTED_ONLINE_VERIFY,
	ABORTED_RESYNC,
	CONNECTION_LOST_NEGOTIATING,
	IMPLICITLY_UPGRADED_DISK,
	IMPLICITLY_UPGRADED_PDSK,
};
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

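/* Condition evaluated by the wait_event() in drbd_req_state() while a
 * cluster-wide state change is pending: SS_CW_SUCCESS or
 * SS_CW_FAILED_BY_PEER once the peer has answered, SS_CW_NO_NEED if the
 * change turns out not to be cluster wide, an SS_ error code if the change
 * became invalid in the meantime, or SS_UNKNOWN_ERROR to keep waiting. */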
static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform a possibly cluster-wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
		  union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
		rv = SS_CONNECTED_OUTDATES;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	/* While establishing a connection only allow cstate to change.
	   Delay/refuse role changes, detach attach etc... */
	if (test_bit(STATE_SENT, &mdev->flags) &&
	    !(os.conn == C_WF_REPORT_PARAMS ||
	      (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
{
	static const char *msg_table[] = {
		[NO_WARNING] = "",
		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
		[ABORTED_RESYNC] = "Resync aborted.",
		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
	};

	if (warn != NO_WARNING)
		dev_warn(DEV, "%s\n", msg_table[warn]);
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn:	pointer to receive an optional warning; may be NULL.
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	if (warn)
		*warn = NO_WARNING;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow Network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn)
			*warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			if (warn)
				*warn = CONNECTION_LOST_NEGOTIATING;
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection stat on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_DISK;
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_PDSK;
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	enum sanitize_state_warnings ssw;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &ssw);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	print_sanitize_warnings(mdev, ssw);

	{
	char *pbp, pb[300];
	pbp = pb;
	*pbp = 0;
	if (ns.role != os.role)
		pbp += sprintf(pbp, "role( %s -> %s ) ",
			       drbd_role_str(os.role),
			       drbd_role_str(ns.role));
	if (ns.peer != os.peer)
		pbp += sprintf(pbp, "peer( %s -> %s ) ",
			       drbd_role_str(os.peer),
			       drbd_role_str(ns.peer));
	if (ns.conn != os.conn)
		pbp += sprintf(pbp, "conn( %s -> %s ) ",
			       drbd_conn_str(os.conn),
			       drbd_conn_str(ns.conn));
	if (ns.disk != os.disk)
		pbp += sprintf(pbp, "disk( %s -> %s ) ",
			       drbd_disk_str(os.disk),
			       drbd_disk_str(ns.disk));
	if (ns.pdsk != os.pdsk)
		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
			       drbd_disk_str(os.pdsk),
			       drbd_disk_str(ns.pdsk));
	if (is_susp(ns) != is_susp(os))
		pbp += sprintf(pbp, "susp( %d -> %d ) ",
			       is_susp(os),
			       is_susp(ns));
	if (ns.aftr_isp != os.aftr_isp)
		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
			       os.aftr_isp,
			       ns.aftr_isp);
	if (ns.peer_isp != os.peer_isp)
		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
			       os.peer_isp,
			       ns.peer_isp);
	if (ns.user_isp != os.user_isp)
		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
			       os.user_isp,
			       ns.user_isp);
	dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_WF_CONNECTION &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	/* remember last connect and attach times so request_timer_fn() won't
	 * kill newly established sessions while we are still trying to thaw
	 * previously frozen IO */
	if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
		mdev->last_reconnect_jif = jiffies;
	if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
	    ns.disk > D_NEGOTIATING)
		mdev->last_reattach_jif = jiffies;

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

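/* Worker callback: run after_state_ch() from worker context, then complete
 * the optional completion for CS_WAIT_COMPLETE callers and free the work. */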
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

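/* Bitmap IO from worker context: suspend application IO (non-blocking, we
 * are the worker ourselves), take the bitmap lock, run io_fn(), then unlock
 * and resume IO.  Returns the result of io_fn(). */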
int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
			       int (*io_fn)(struct drbd_conf *),
			       char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current == mdev->worker.task);

	/* open coded non-blocking drbd_suspend_io(mdev); */
	set_bit(SUSPEND_IO, &mdev->flags);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
		mod_timer(&mdev->request_timer, jiffies + HZ);

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
			what = resend;

		if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
		    ns.disk > D_NEGOTIATING)
			what = restart_frozen_disk_io;

		if (what != nothing)
			nsm.susp_nod = 0;
	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_tl_clear(mdev);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}

	/* Became sync source.  With protocol >= 96, we still need to send out
	 * the sync uuid now. Need to do that before any drbd_send_state, or
	 * the other side may go "paused sync" before receiving the sync uuids,
	 * which is unexpected. */
	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
		drbd_gen_and_send_sync_uuid(mdev);
		put_ldev(mdev);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		/* we probably will start a resync soon.
		 * make sure those things are properly reset. */
		mdev->rs_total = 0;
		mdev->rs_failed = 0;
		atomic_set(&mdev->rs_pending_cnt, 0);
		drbd_rs_cancel_all(mdev);

		drbd_send_uuids(mdev);
		drbd_send_state(mdev, ns);
	}
	/* No point in queuing send_bitmap if we don't have a connection
	 * anymore, so check also the _current_ state, not only the new state
	 * at the time this work was queued. */
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
	    mdev->state.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
				"send_bitmap (WFBitMapS)",
				BM_LOCKED_TEST_ALLOWED);

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					set_bit(NEW_CUR_UUID, &mdev->flags);
1533 } else {
1534 drbd_uuid_new_current(mdev);
1535 drbd_send_uuids(mdev);
1536 }
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001537 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001538 put_ldev(mdev);
1539 }
1540 }
1541
1542 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
Philipp Reisnerbca482e2011-07-15 12:14:27 +02001543 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1544 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001545 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001546 drbd_send_uuids(mdev);
1547 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001548 /* D_DISKLESS Peer becomes secondary */
1549 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001550 /* We may still be Primary ourselves.
1551 * No harm done if the bitmap still changes,
1552 * redirtied pages will follow later. */
1553 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1554 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001555 put_ldev(mdev);
1556 }
1557
Lars Ellenberg06d33e92010-12-18 17:00:59 +01001558 /* Write out all changed bits on demote.
1559 * Though, no need to da that just yet
1560 * if there is a resync going on still */
1561 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1562 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001563 /* No changes to the bitmap expected this time, so assert that,
1564	 * even though no harm would be done if it did change. */
1565 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1566 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001567 put_ldev(mdev);
1568 }
1569
1570 /* Last part of the attaching process ... */
1571 if (ns.conn >= C_CONNECTED &&
1572 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001573 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001574 drbd_send_uuids(mdev);
Lars Ellenbergf479ea02011-10-27 16:52:30 +02001575 drbd_send_state(mdev, ns);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001576 }
1577
1578 /* We want to pause/continue resync, tell peer. */
1579 if (ns.conn >= C_CONNECTED &&
1580 ((os.aftr_isp != ns.aftr_isp) ||
1581 (os.user_isp != ns.user_isp)))
Lars Ellenbergf479ea02011-10-27 16:52:30 +02001582 drbd_send_state(mdev, ns);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001583
1584 /* In case one of the isp bits got set, suspend other devices. */
1585 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1586 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1587 suspend_other_sg(mdev);
1588
1589	/* Make sure the peer gets informed about any state
1590	   changes (ISP bits) that happened while we were in WFReportParams. */
1591 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
Lars Ellenbergf479ea02011-10-27 16:52:30 +02001592 drbd_send_state(mdev, ns);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001593
Philipp Reisner67531712010-10-27 12:21:30 +02001594 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
Lars Ellenbergf479ea02011-10-27 16:52:30 +02001595 drbd_send_state(mdev, ns);
Philipp Reisner67531712010-10-27 12:21:30 +02001596
Philipp Reisnerb411b362009-09-25 16:07:19 -07001597	/* We are in the process of starting a full sync... */
1598 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1599 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001600 /* no other bitmap changes expected during this phase */
1601 drbd_queue_bitmap_io(mdev,
1602 &drbd_bmio_set_n_write, &abw_start_sync,
1603 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001604
1605	/* We are invalidating ourselves... */
1606 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1607 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001608 /* other bitmap operation expected during this phase */
1609 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1610 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001611
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001612 /* first half of local IO error, failure to attach,
1613 * or administrative detach */
1614 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
Philipp Reisner7caacb62011-12-14 18:01:21 +01001615 enum drbd_io_error_p eh = EP_PASS_ON;
1616 int was_io_error = 0;
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001617 /* corresponding get_ldev was in __drbd_set_state, to serialize
Philipp Reisner7caacb62011-12-14 18:01:21 +01001618 * our cleanup here with the transition to D_DISKLESS.
1619	 * But it is still not safe to dereference ldev here, since
1620	 * we might come from a failed Attach before ldev was set. */
1621 if (mdev->ldev) {
1622 eh = mdev->ldev->dc.on_io_error;
1623 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001624
Lars Ellenberg63a6d0b2012-06-18 11:46:01 +02001625 if (was_io_error && eh == EP_CALL_HELPER)
1626 drbd_khelper(mdev, "local-io-error");
1627
Lars Ellenberg383606e2012-06-14 14:21:32 +02001628 /* Immediately allow completion of all application IO,
1629 * that waits for completion from the local disk,
1630 * if this was a force-detach due to disk_timeout
1631 * or administrator request (drbdsetup detach --force).
1632 * Do NOT abort otherwise.
1633 * Aborting local requests may cause serious problems,
1634 * if requests are completed to upper layers already,
1635 * and then later the already submitted local bio completes.
1636 * This can cause DMA into former bio pages that meanwhile
1637 * have been re-used for other things.
1638 * So aborting local requests may cause crashes,
1639 * or even worse, silent data corruption.
1640 */
1641 if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
1642 tl_abort_disk_io(mdev);
Philipp Reisner2b4dd362011-03-14 13:01:50 +01001643
Philipp Reisner7caacb62011-12-14 18:01:21 +01001644 /* current state still has to be D_FAILED,
1645 * there is only one way out: to D_DISKLESS,
1646 * and that may only happen after our put_ldev below. */
1647 if (mdev->state.disk != D_FAILED)
1648 dev_err(DEV,
1649 "ASSERT FAILED: disk is %s during detach\n",
1650 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001651
Philipp Reisner7caacb62011-12-14 18:01:21 +01001652 if (ns.conn >= C_CONNECTED)
1653 drbd_send_state(mdev, ns);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001654
Philipp Reisner7caacb62011-12-14 18:01:21 +01001655 drbd_rs_cancel_all(mdev);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001656
Philipp Reisner7caacb62011-12-14 18:01:21 +01001657 /* In case we want to get something to stable storage still,
1658 * this may be the last chance.
1659 * Following put_ldev may transition to D_DISKLESS. */
1660 drbd_md_sync(mdev);
1661 }
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001662 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001663 }
1664
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001665 /* second half of local IO error, failure to attach,
1666 * or administrative detach,
1667 * after local_cnt references have reached zero again */
1668 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1669 /* We must still be diskless,
1670 * re-attach has to be serialized with this! */
1671 if (mdev->state.disk != D_DISKLESS)
1672 dev_err(DEV,
1673 "ASSERT FAILED: disk is %s while going diskless\n",
1674 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001675
Philipp Reisner4afc4332011-12-13 10:31:32 +01001676 if (ns.conn >= C_CONNECTED)
1677 drbd_send_state(mdev, ns);
1678
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001679 /* corresponding get_ldev in __drbd_set_state
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001680 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001681 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001682 }
1683
Philipp Reisner738a84b2011-03-03 00:21:30 +01001684	/* Notify peer that I had a local IO error and did not detach. */
Philipp Reisner4afc4332011-12-13 10:31:32 +01001685 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
Lars Ellenbergf479ea02011-10-27 16:52:30 +02001686 drbd_send_state(mdev, ns);
Philipp Reisner738a84b2011-03-03 00:21:30 +01001687
Philipp Reisnerb411b362009-09-25 16:07:19 -07001688 /* Disks got bigger while they were detached */
1689 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1690 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1691 if (ns.conn == C_CONNECTED)
1692 resync_after_online_grow(mdev);
1693 }
1694
1695 /* A resync finished or aborted, wake paused devices... */
1696 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1697 (os.peer_isp && !ns.peer_isp) ||
1698 (os.user_isp && !ns.user_isp))
1699 resume_next_sg(mdev);
1700
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001701 /* sync target done with resync. Explicitly notify peer, even though
1702 * it should (at least for non-empty resyncs) already know itself. */
1703 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
Lars Ellenbergf479ea02011-10-27 16:52:30 +02001704 drbd_send_state(mdev, ns);
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001705
Philipp Reisner197296f2012-03-26 16:47:11 +02001706	/* Wake up role changes that were delayed while the connection was being established */
1707 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
1708 clear_bit(STATE_SENT, &mdev->flags);
1709 wake_up(&mdev->state_wait);
1710 }
1711
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001712 /* This triggers bitmap writeout of potentially still unwritten pages
1713 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001714 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001715 * For resync aborted because of local disk failure, we cannot do
1716 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001717 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001718 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001719 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg0e8488a2012-04-25 23:06:45 +02001720 drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
1721 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001722 put_ldev(mdev);
1723 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001724
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001725	/* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001726 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001727 drbd_free_tl_hash(mdev);
1728
Philipp Reisnerb411b362009-09-25 16:07:19 -07001729 /* Upon network connection, we need to start the receiver */
1730 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1731 drbd_thread_start(&mdev->receiver);
1732
1733 /* Terminate worker thread if we are unconfigured - it will be
1734 restarted as needed... */
1735 if (ns.disk == D_DISKLESS &&
1736 ns.conn == C_STANDALONE &&
1737 ns.role == R_SECONDARY) {
1738 if (os.aftr_isp != ns.aftr_isp)
1739 resume_next_sg(mdev);
1740 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1741 if (test_bit(DEVICE_DYING, &mdev->flags))
1742 drbd_thread_stop_nowait(&mdev->worker);
1743 }
1744
1745 drbd_md_sync(mdev);
1746}
1747
1748
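/* Common entry point for the per-device kernel threads (receiver, worker,
 * asender): run thi->function() and, if the thread state was set to
 * Restarting in the meantime, loop and run it again instead of exiting. */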
1749static int drbd_thread_setup(void *arg)
1750{
1751 struct drbd_thread *thi = (struct drbd_thread *) arg;
1752 struct drbd_conf *mdev = thi->mdev;
1753 unsigned long flags;
1754 int retval;
1755
1756restart:
1757 retval = thi->function(thi);
1758
1759 spin_lock_irqsave(&thi->t_lock, flags);
1760
1761 /* if the receiver has been "Exiting", the last thing it did
1762 * was set the conn state to "StandAlone",
1763 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1764 * and receiver thread will be "started".
1765 * drbd_thread_start needs to set "Restarting" in that case.
1766 * t_state check and assignment needs to be within the same spinlock,
1767 * so either thread_start sees Exiting, and can remap to Restarting,
1768	 * or thread_start sees None, and can proceed as normal.
1769 */
1770
1771 if (thi->t_state == Restarting) {
1772 dev_info(DEV, "Restarting %s\n", current->comm);
1773 thi->t_state = Running;
1774 spin_unlock_irqrestore(&thi->t_lock, flags);
1775 goto restart;
1776 }
1777
1778 thi->task = NULL;
1779 thi->t_state = None;
1780 smp_mb();
1781 complete(&thi->stop);
1782 spin_unlock_irqrestore(&thi->t_lock, flags);
1783
1784 dev_info(DEV, "Terminating %s\n", current->comm);
1785
1786 /* Release mod reference taken when thread was started */
1787 module_put(THIS_MODULE);
1788 return retval;
1789}
1790
1791static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1792 int (*func) (struct drbd_thread *))
1793{
1794 spin_lock_init(&thi->t_lock);
1795 thi->task = NULL;
1796 thi->t_state = None;
1797 thi->function = func;
1798 thi->mdev = mdev;
1799}
1800
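/* Start (or restart) one of the per-device threads. Takes a module
 * reference that the thread releases again on exit; returns true on
 * success, false if the kthread could not be created. */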
1801int drbd_thread_start(struct drbd_thread *thi)
1802{
1803 struct drbd_conf *mdev = thi->mdev;
1804 struct task_struct *nt;
1805 unsigned long flags;
1806
1807 const char *me =
1808 thi == &mdev->receiver ? "receiver" :
1809 thi == &mdev->asender ? "asender" :
1810 thi == &mdev->worker ? "worker" : "NONSENSE";
1811
1812 /* is used from state engine doing drbd_thread_stop_nowait,
1813 * while holding the req lock irqsave */
1814 spin_lock_irqsave(&thi->t_lock, flags);
1815
1816 switch (thi->t_state) {
1817 case None:
1818 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1819 me, current->comm, current->pid);
1820
1821 /* Get ref on module for thread - this is released when thread exits */
1822 if (!try_module_get(THIS_MODULE)) {
1823 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1824 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001825 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001826 }
1827
1828 init_completion(&thi->stop);
1829 D_ASSERT(thi->task == NULL);
1830 thi->reset_cpu_mask = 1;
1831 thi->t_state = Running;
1832 spin_unlock_irqrestore(&thi->t_lock, flags);
1833 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1834
1835 nt = kthread_create(drbd_thread_setup, (void *) thi,
1836 "drbd%d_%s", mdev_to_minor(mdev), me);
1837
1838 if (IS_ERR(nt)) {
1839 dev_err(DEV, "Couldn't start thread\n");
1840
1841 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001842 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001843 }
1844 spin_lock_irqsave(&thi->t_lock, flags);
1845 thi->task = nt;
1846 thi->t_state = Running;
1847 spin_unlock_irqrestore(&thi->t_lock, flags);
1848 wake_up_process(nt);
1849 break;
1850 case Exiting:
1851 thi->t_state = Restarting;
1852 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1853 me, current->comm, current->pid);
1854 /* fall through */
1855 case Running:
1856 case Restarting:
1857 default:
1858 spin_unlock_irqrestore(&thi->t_lock, flags);
1859 break;
1860 }
1861
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001862 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001863}
1864
1865
1866void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1867{
1868 unsigned long flags;
1869
1870 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1871
1872 /* may be called from state engine, holding the req lock irqsave */
1873 spin_lock_irqsave(&thi->t_lock, flags);
1874
1875 if (thi->t_state == None) {
1876 spin_unlock_irqrestore(&thi->t_lock, flags);
1877 if (restart)
1878 drbd_thread_start(thi);
1879 return;
1880 }
1881
1882 if (thi->t_state != ns) {
1883 if (thi->task == NULL) {
1884 spin_unlock_irqrestore(&thi->t_lock, flags);
1885 return;
1886 }
1887
1888 thi->t_state = ns;
1889 smp_mb();
1890 init_completion(&thi->stop);
1891 if (thi->task != current)
1892 force_sig(DRBD_SIGKILL, thi->task);
1893
1894 }
1895
1896 spin_unlock_irqrestore(&thi->t_lock, flags);
1897
1898 if (wait)
1899 wait_for_completion(&thi->stop);
1900}
1901
1902#ifdef CONFIG_SMP
1903/**
1904 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1905 * @mdev: DRBD device.
1906 *
1907 * Forces all threads of a device onto the same CPU. This is beneficial for
1908	 * DRBD's performance. May be overridden by the user's configuration.
1909 */
1910void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1911{
1912 int ord, cpu;
1913
1914 /* user override. */
1915 if (cpumask_weight(mdev->cpu_mask))
1916 return;
1917
1918 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1919 for_each_online_cpu(cpu) {
1920 if (ord-- == 0) {
1921 cpumask_set_cpu(cpu, mdev->cpu_mask);
1922 return;
1923 }
1924 }
1925 /* should not be reached */
1926 cpumask_setall(mdev->cpu_mask);
1927}
1928
1929/**
1930 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1931 * @mdev: DRBD device.
1932 *
1933 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1934 * prematurely.
1935 */
1936void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1937{
1938 struct task_struct *p = current;
1939 struct drbd_thread *thi =
1940 p == mdev->asender.task ? &mdev->asender :
1941 p == mdev->receiver.task ? &mdev->receiver :
1942 p == mdev->worker.task ? &mdev->worker :
1943 NULL;
1944 ERR_IF(thi == NULL)
1945 return;
1946 if (!thi->reset_cpu_mask)
1947 return;
1948 thi->reset_cpu_mask = 0;
1949 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1950}
1951#endif
1952
1953/* the appropriate socket mutex must be held already */
1954int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001955 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001956 size_t size, unsigned msg_flags)
1957{
1958 int sent, ok;
1959
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001960 ERR_IF(!h) return false;
1961 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001962
1963 h->magic = BE_DRBD_MAGIC;
1964 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001965 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001966
Philipp Reisnerb411b362009-09-25 16:07:19 -07001967 sent = drbd_send(mdev, sock, h, size, msg_flags);
1968
1969 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001970 if (!ok && !signal_pending(current))
1971 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001972 cmdname(cmd), (int)size, sent);
1973 return ok;
1974}
1975
1976/* don't pass the socket. we may only look at it
1977 * when we hold the appropriate socket mutex.
1978 */
1979int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001980 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001981{
1982 int ok = 0;
1983 struct socket *sock;
1984
1985 if (use_data_socket) {
1986 mutex_lock(&mdev->data.mutex);
1987 sock = mdev->data.socket;
1988 } else {
1989 mutex_lock(&mdev->meta.mutex);
1990 sock = mdev->meta.socket;
1991 }
1992
1993 /* drbd_disconnect() could have called drbd_free_sock()
1994 * while we were waiting in down()... */
1995 if (likely(sock != NULL))
1996 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1997
1998 if (use_data_socket)
1999 mutex_unlock(&mdev->data.mutex);
2000 else
2001 mutex_unlock(&mdev->meta.mutex);
2002 return ok;
2003}
2004
2005int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
2006 size_t size)
2007{
Philipp Reisner0b70a132010-08-20 13:36:10 +02002008 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002009 int ok;
2010
2011 h.magic = BE_DRBD_MAGIC;
2012 h.command = cpu_to_be16(cmd);
2013 h.length = cpu_to_be16(size);
2014
2015 if (!drbd_get_data_sock(mdev))
2016 return 0;
2017
Philipp Reisnerb411b362009-09-25 16:07:19 -07002018 ok = (sizeof(h) ==
2019 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
2020 ok = ok && (size ==
2021 drbd_send(mdev, mdev->data.socket, data, size, 0));
2022
2023 drbd_put_data_sock(mdev);
2024
2025 return ok;
2026}
2027
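/* Send the syncer configuration to the peer. The packet layout depends on
 * the agreed protocol version: rate only (<= 87), plus verify_alg (88),
 * plus csums_alg (89..94), plus the c_* dynamic resync settings (>= 95). */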
2028int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
2029{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002030 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002031 struct socket *sock;
2032 int size, rv;
2033 const int apv = mdev->agreed_pro_version;
2034
2035 size = apv <= 87 ? sizeof(struct p_rs_param)
2036 : apv == 88 ? sizeof(struct p_rs_param)
2037 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002038 : apv <= 94 ? sizeof(struct p_rs_param_89)
2039 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002040
2041 /* used from admin command context and receiver/worker context.
2042 * to avoid kmalloc, grab the socket right here,
2043 * then use the pre-allocated sbuf there */
2044 mutex_lock(&mdev->data.mutex);
2045 sock = mdev->data.socket;
2046
2047 if (likely(sock != NULL)) {
2048 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
2049
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002050 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002051
2052 /* initialize verify_alg and csums_alg */
2053 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2054
2055 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002056 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
2057 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
2058 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
2059 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002060
2061 if (apv >= 88)
2062 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2063 if (apv >= 89)
2064 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2065
2066 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2067 } else
2068 rv = 0; /* not ok */
2069
2070 mutex_unlock(&mdev->data.mutex);
2071
2072 return rv;
2073}
2074
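/* Send our connection configuration during the handshake: wire protocol,
 * after-split-brain policies, two-primaries flag, connection flags and
 * (for protocol >= 87) the data integrity algorithm. */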
2075int drbd_send_protocol(struct drbd_conf *mdev)
2076{
2077 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002078 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002079
2080 size = sizeof(struct p_protocol);
2081
2082 if (mdev->agreed_pro_version >= 87)
2083 size += strlen(mdev->net_conf->integrity_alg) + 1;
2084
2085 /* we must not recurse into our own queue,
2086 * as that is blocked during handshake */
2087 p = kmalloc(size, GFP_NOIO);
2088 if (p == NULL)
2089 return 0;
2090
2091 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2092 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2093 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2094 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002095 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2096
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002097 cf = 0;
2098 if (mdev->net_conf->want_lose)
2099 cf |= CF_WANT_LOSE;
2100 if (mdev->net_conf->dry_run) {
2101 if (mdev->agreed_pro_version >= 92)
2102 cf |= CF_DRY_RUN;
2103 else {
2104 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02002105 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002106 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002107 }
2108 }
2109 p->conn_flags = cpu_to_be32(cf);
2110
Philipp Reisnerb411b362009-09-25 16:07:19 -07002111 if (mdev->agreed_pro_version >= 87)
2112 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2113
2114 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002115 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002116 kfree(p);
2117 return rv;
2118}
2119
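/* Send our UUID set to the peer, together with the number of bits
 * currently set in the bitmap and a few flag bits (want_lose, crashed
 * primary, new disk state still inconsistent). */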
2120int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2121{
2122 struct p_uuids p;
2123 int i;
2124
2125 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2126 return 1;
2127
Philipp Reisner9f2247b2012-08-16 14:25:58 +02002128 spin_lock_irq(&mdev->ldev->md.uuid_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002129 for (i = UI_CURRENT; i < UI_SIZE; i++)
2130 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
Philipp Reisner9f2247b2012-08-16 14:25:58 +02002131 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002132
2133 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2134 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2135 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2136 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2137 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2138 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2139
2140 put_ldev(mdev);
2141
2142 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002143 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002144}
2145
2146int drbd_send_uuids(struct drbd_conf *mdev)
2147{
2148 return _drbd_send_uuids(mdev, 0);
2149}
2150
2151int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2152{
2153 return _drbd_send_uuids(mdev, 8);
2154}
2155
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002156void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2157{
2158 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2159 u64 *uuid = mdev->ldev->md.uuid;
2160 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2161 text,
2162 (unsigned long long)uuid[UI_CURRENT],
2163 (unsigned long long)uuid[UI_BITMAP],
2164 (unsigned long long)uuid[UI_HISTORY_START],
2165 (unsigned long long)uuid[UI_HISTORY_END]);
2166 put_ldev(mdev);
2167 } else {
2168 dev_info(DEV, "%s effective data uuid: %016llX\n",
2169 text,
2170 (unsigned long long)mdev->ed_uuid);
2171 }
2172}
2173
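/* Derive a fresh sync UUID from the bitmap UUID (or generate a random one
 * if there is none yet), store it and send it to the peer (P_SYNC_UUID). */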
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002174int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002175{
2176 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002177 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002178
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002179 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2180
Philipp Reisner5ba3dac2011-10-05 15:54:18 +02002181 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2182 if (uuid && uuid != UUID_JUST_CREATED)
2183 uuid = uuid + UUID_NEW_BM_OFFSET;
2184 else
2185 get_random_bytes(&uuid, sizeof(u64));
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002186 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002187 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002188 drbd_md_sync(mdev);
2189 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002190
2191 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002192 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002193}
2194
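/* Tell the peer about our backing device: usable size, user-configured
 * size limit, current capacity and the largest bio size we accept, plus
 * the queue ordering type and the dds flags. */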
Philipp Reisnere89b5912010-03-24 17:11:33 +01002195int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002196{
2197 struct p_sizes p;
2198 sector_t d_size, u_size;
Lars Ellenbergdb141b22012-06-25 19:15:58 +02002199 int q_order_type;
2200 unsigned int max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002201 int ok;
2202
2203 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2204 D_ASSERT(mdev->ldev->backing_bdev);
2205 d_size = drbd_get_max_capacity(mdev->ldev);
2206 u_size = mdev->ldev->dc.disk_size;
2207 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002208 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
Lars Ellenbergdb141b22012-06-25 19:15:58 +02002209 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002210 put_ldev(mdev);
2211 } else {
2212 d_size = 0;
2213 u_size = 0;
2214 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002215 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002216 }
2217
Philipp Reisner68093842011-06-30 15:43:06 +02002218 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2219 if (mdev->agreed_pro_version <= 94)
Lars Ellenbergdb141b22012-06-25 19:15:58 +02002220 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
Philipp Reisner68093842011-06-30 15:43:06 +02002221
Philipp Reisnerb411b362009-09-25 16:07:19 -07002222 p.d_size = cpu_to_be64(d_size);
2223 p.u_size = cpu_to_be64(u_size);
2224 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002225 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002226 p.queue_order_type = cpu_to_be16(q_order_type);
2227 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002228
2229 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002230 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002231 return ok;
2232}
2233
2234/**
Lars Ellenbergf479ea02011-10-27 16:52:30 +02002235 * drbd_send_current_state() - Sends the drbd state to the peer
Philipp Reisnerb411b362009-09-25 16:07:19 -07002236 * @mdev: DRBD device.
2237 */
Lars Ellenbergf479ea02011-10-27 16:52:30 +02002238int drbd_send_current_state(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002239{
2240 struct socket *sock;
2241 struct p_state p;
2242 int ok = 0;
2243
2244	/* Grab state lock so we won't send state if we're in the middle
2245 * of a cluster wide state change on another thread */
2246 drbd_state_lock(mdev);
2247
2248 mutex_lock(&mdev->data.mutex);
2249
2250 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2251 sock = mdev->data.socket;
2252
2253 if (likely(sock != NULL)) {
2254 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002255 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002256 }
2257
2258 mutex_unlock(&mdev->data.mutex);
2259
2260 drbd_state_unlock(mdev);
2261 return ok;
2262}
2263
Lars Ellenbergf479ea02011-10-27 16:52:30 +02002264/**
2265 * drbd_send_state() - After a state change, sends the new state to the peer
2266 * @mdev: DRBD device.
2267 * @state: the state to send, not necessarily the current state.
2268 *
2269 * Each state change queues an "after_state_ch" work, which will eventually
2270 * send the resulting new state to the peer. If more state changes happen
2271 * between queuing and processing of the after_state_ch work, we still
2272 * want to send each intermediary state in the order it occurred.
2273 */
2274int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2275{
2276 struct socket *sock;
2277 struct p_state p;
2278 int ok = 0;
2279
2280 mutex_lock(&mdev->data.mutex);
2281
2282 p.state = cpu_to_be32(state.i);
2283 sock = mdev->data.socket;
2284
2285 if (likely(sock != NULL)) {
2286 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2287 (struct p_header80 *)&p, sizeof(p), 0);
2288 }
2289
2290 mutex_unlock(&mdev->data.mutex);
2291
2292 return ok;
2293}
2294
Philipp Reisnerb411b362009-09-25 16:07:19 -07002295int drbd_send_state_req(struct drbd_conf *mdev,
2296 union drbd_state mask, union drbd_state val)
2297{
2298 struct p_req_state p;
2299
2300 p.mask = cpu_to_be32(mask.i);
2301 p.val = cpu_to_be32(val.i);
2302
2303 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002304 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002305}
2306
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002307int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002308{
2309 struct p_req_state_reply p;
2310
2311 p.retcode = cpu_to_be32(retcode);
2312
2313 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002314 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002315}
2316
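/* Compress a chunk of the bitmap into p->code: the distances between bit
 * toggles are run-length encoded and stored as variable-length integers
 * (VLI). Returns the number of code bytes used, 0 if this chunk is not
 * worth compressing (or RLE is disabled / the peer is too old), and -1 on
 * an unexpected zero run length. */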
2317int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2318 struct p_compressed_bm *p,
2319 struct bm_xfer_ctx *c)
2320{
2321 struct bitstream bs;
2322 unsigned long plain_bits;
2323 unsigned long tmp;
2324 unsigned long rl;
2325 unsigned len;
2326 unsigned toggle;
2327 int bits;
2328
2329 /* may we use this feature? */
2330 if ((mdev->sync_conf.use_rle == 0) ||
2331 (mdev->agreed_pro_version < 90))
2332 return 0;
2333
2334 if (c->bit_offset >= c->bm_bits)
2335 return 0; /* nothing to do. */
2336
2337	/* use at most this many bytes */
2338 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2339 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2340 /* plain bits covered in this code string */
2341 plain_bits = 0;
2342
2343 /* p->encoding & 0x80 stores whether the first run length is set.
2344 * bit offset is implicit.
2345 * start with toggle == 2 to be able to tell the first iteration */
2346 toggle = 2;
2347
2348	/* see how many plain bits we can stuff into one packet
2349 * using RLE and VLI. */
2350 do {
2351 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2352 : _drbd_bm_find_next(mdev, c->bit_offset);
2353 if (tmp == -1UL)
2354 tmp = c->bm_bits;
2355 rl = tmp - c->bit_offset;
2356
2357 if (toggle == 2) { /* first iteration */
2358 if (rl == 0) {
2359 /* the first checked bit was set,
2360 * store start value, */
2361 DCBP_set_start(p, 1);
2362 /* but skip encoding of zero run length */
2363 toggle = !toggle;
2364 continue;
2365 }
2366 DCBP_set_start(p, 0);
2367 }
2368
2369 /* paranoia: catch zero runlength.
2370 * can only happen if bitmap is modified while we scan it. */
2371 if (rl == 0) {
2372 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2373 "t:%u bo:%lu\n", toggle, c->bit_offset);
2374 return -1;
2375 }
2376
2377 bits = vli_encode_bits(&bs, rl);
2378 if (bits == -ENOBUFS) /* buffer full */
2379 break;
2380 if (bits <= 0) {
2381 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2382 return 0;
2383 }
2384
2385 toggle = !toggle;
2386 plain_bits += rl;
2387 c->bit_offset = tmp;
2388 } while (c->bit_offset < c->bm_bits);
2389
2390 len = bs.cur.b - p->code + !!bs.cur.bit;
2391
2392 if (plain_bits < (len << 3)) {
2393 /* incompressible with this method.
2394 * we need to rewind both word and bit position. */
2395 c->bit_offset -= plain_bits;
2396 bm_xfer_ctx_bit_to_word_offset(c);
2397 c->bit_offset = c->word_offset * BITS_PER_LONG;
2398 return 0;
2399 }
2400
2401 /* RLE + VLI was able to compress it just fine.
2402 * update c->word_offset. */
2403 bm_xfer_ctx_bit_to_word_offset(c);
2404
2405 /* store pad_bits */
2406 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2407
2408 return len;
2409}
2410
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002411/**
2412 * send_bitmap_rle_or_plain
2413 *
2414 * Return 0 when done, 1 when another iteration is needed, and a negative error
2415 * code upon failure.
2416 */
2417static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002418send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002419 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002420{
2421 struct p_compressed_bm *p = (void*)h;
2422 unsigned long num_words;
2423 int len;
2424 int ok;
2425
2426 len = fill_bitmap_rle_bits(mdev, p, c);
2427
2428 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002429 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002430
2431 if (len) {
2432 DCBP_set_code(p, RLE_VLI_Bits);
2433 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2434 sizeof(*p) + len, 0);
2435
2436 c->packets[0]++;
2437 c->bytes[0] += sizeof(*p) + len;
2438
2439 if (c->bit_offset >= c->bm_bits)
2440 len = 0; /* DONE */
2441 } else {
2442 /* was not compressible.
2443 * send a buffer full of plain text bits instead. */
2444 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2445 len = num_words * sizeof(long);
2446 if (len)
2447 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2448 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002449 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002450 c->word_offset += num_words;
2451 c->bit_offset = c->word_offset * BITS_PER_LONG;
2452
2453 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002454 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002455
2456 if (c->bit_offset > c->bm_bits)
2457 c->bit_offset = c->bm_bits;
2458 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002459 if (ok) {
2460 if (len == 0) {
2461 INFO_bm_xfer_stats(mdev, "send", c);
2462 return 0;
2463 } else
2464 return 1;
2465 }
2466 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002467}
2468
2469/* See the comment at receive_bitmap() */
2470int _drbd_send_bitmap(struct drbd_conf *mdev)
2471{
2472 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002473 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002474 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002475
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002476 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002477
2478 /* maybe we should use some per thread scratch page,
2479 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002480 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002481 if (!p) {
2482 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002483 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002484 }
2485
2486 if (get_ldev(mdev)) {
2487 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2488 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2489 drbd_bm_set_all(mdev);
2490 if (drbd_bm_write(mdev)) {
2491 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2492 * but otherwise process as per normal - need to tell other
2493 * side that a full resync is required! */
2494 dev_err(DEV, "Failed to write bitmap to disk!\n");
2495 } else {
2496 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2497 drbd_md_sync(mdev);
2498 }
2499 }
2500 put_ldev(mdev);
2501 }
2502
2503 c = (struct bm_xfer_ctx) {
2504 .bm_bits = drbd_bm_bits(mdev),
2505 .bm_words = drbd_bm_words(mdev),
2506 };
2507
2508 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002509 err = send_bitmap_rle_or_plain(mdev, p, &c);
2510 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002511
2512 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002513 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002514}
2515
2516int drbd_send_bitmap(struct drbd_conf *mdev)
2517{
2518 int err;
2519
2520 if (!drbd_get_data_sock(mdev))
2521 return -1;
2522 err = !_drbd_send_bitmap(mdev);
2523 drbd_put_data_sock(mdev);
2524 return err;
2525}
2526
2527int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2528{
2529 int ok;
2530 struct p_barrier_ack p;
2531
2532 p.barrier = barrier_nr;
2533 p.set_size = cpu_to_be32(set_size);
2534
2535 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002536 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002537 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002538 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002539 return ok;
2540}
2541
2542/**
2543 * _drbd_send_ack() - Sends an ack packet
2544 * @mdev: DRBD device.
2545 * @cmd: Packet command code.
2546 * @sector: sector, needs to be in big endian byte order
2547 * @blksize: size in byte, needs to be in big endian byte order
2548 * @block_id: Id, big endian byte order
2549 */
2550static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2551 u64 sector,
2552 u32 blksize,
2553 u64 block_id)
2554{
2555 int ok;
2556 struct p_block_ack p;
2557
2558 p.sector = sector;
2559 p.block_id = block_id;
2560 p.blksize = blksize;
2561 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2562
2563 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002564 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002565 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002566 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002567 return ok;
2568}
2569
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002570/* dp->sector and dp->block_id already/still in network byte order,
2571 * data_size is payload size according to dp->head,
2572 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002573int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002574 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002575{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002576 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2577 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002578 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2579 dp->block_id);
2580}
2581
2582int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2583 struct p_block_req *rp)
2584{
2585 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2586}
2587
2588/**
2589 * drbd_send_ack() - Sends an ack packet
2590 * @mdev: DRBD device.
2591 * @cmd: Packet command code.
2592 * @e: Epoch entry.
2593 */
2594int drbd_send_ack(struct drbd_conf *mdev,
2595 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2596{
2597 return _drbd_send_ack(mdev, cmd,
2598 cpu_to_be64(e->sector),
2599 cpu_to_be32(e->size),
2600 e->block_id);
2601}
2602
2603/* This function misuses the block_id field to signal if the blocks
2604 * are in sync or not. */
2605int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2606 sector_t sector, int blksize, u64 block_id)
2607{
2608 return _drbd_send_ack(mdev, cmd,
2609 cpu_to_be64(sector),
2610 cpu_to_be32(blksize),
2611 cpu_to_be64(block_id));
2612}
2613
2614int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2615 sector_t sector, int size, u64 block_id)
2616{
2617 int ok;
2618 struct p_block_req p;
2619
2620 p.sector = cpu_to_be64(sector);
2621 p.block_id = block_id;
2622 p.blksize = cpu_to_be32(size);
2623
2624 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002625 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002626 return ok;
2627}
2628
2629int drbd_send_drequest_csum(struct drbd_conf *mdev,
2630 sector_t sector, int size,
2631 void *digest, int digest_size,
2632 enum drbd_packets cmd)
2633{
2634 int ok;
2635 struct p_block_req p;
2636
2637 p.sector = cpu_to_be64(sector);
2638 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2639 p.blksize = cpu_to_be32(size);
2640
2641 p.head.magic = BE_DRBD_MAGIC;
2642 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002643 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002644
2645 mutex_lock(&mdev->data.mutex);
2646
2647 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2648 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2649
2650 mutex_unlock(&mdev->data.mutex);
2651
2652 return ok;
2653}
2654
2655int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2656{
2657 int ok;
2658 struct p_block_req p;
2659
2660 p.sector = cpu_to_be64(sector);
2661 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2662 p.blksize = cpu_to_be32(size);
2663
2664 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002665 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002666 return ok;
2667}
2668
2669/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002670 * returns false if we should retry,
2671 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002672 */
2673static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2674{
2675 int drop_it;
2676 /* long elapsed = (long)(jiffies - mdev->last_received); */
2677
2678 drop_it = mdev->meta.socket == sock
2679 || !mdev->asender.task
2680 || get_t_state(&mdev->asender) != Running
2681 || mdev->state.conn < C_CONNECTED;
2682
2683 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002684 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002685
2686 drop_it = !--mdev->ko_count;
2687 if (!drop_it) {
2688 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2689 current->comm, current->pid, mdev->ko_count);
2690 request_ping(mdev);
2691 }
2692
2693 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2694}
2695
2696/* The idea of sendpage seems to be to put some kind of reference
2697 * to the page into the skb, and to hand it over to the NIC. In
2698 * this process get_page() gets called.
2699 *
2700 * As soon as the page was really sent over the network put_page()
2701 * gets called by some part of the network layer. [ NIC driver? ]
2702 *
2703 * [ get_page() / put_page() increment/decrement the count. If count
2704 * reaches 0 the page will be freed. ]
2705 *
2706 * This works nicely with pages from FSs.
2707 * But this means that in protocol A we might signal IO completion too early!
2708 *
2709 * In order not to corrupt data during a resync we must make sure
2710 * that we do not reuse our own buffer pages (EEs) too early, therefore
2711 * we have the net_ee list.
2712 *
2713 * XFS seems to have problems, still, it submits pages with page_count == 0!
2714 * As a workaround, we disable sendpage on pages
2715 * with page_count == 0 or PageSlab.
2716 */
2717static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002718 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002719{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002720 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002721 kunmap(page);
2722 if (sent == size)
2723 mdev->send_cnt += size>>9;
2724 return sent == size;
2725}
2726
2727static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002728 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002729{
2730 mm_segment_t oldfs = get_fs();
2731 int sent, ok;
2732 int len = size;
2733
2734 /* e.g. XFS meta- & log-data is in slab pages, which have a
2735 * page_count of 0 and/or have PageSlab() set.
2736 * we cannot use send_page for those, as that does get_page();
2737 * put_page(); and would cause either a VM_BUG directly, or
2738 * __page_cache_release a page that would actually still be referenced
2739 * by someone, leading to some obscure delayed Oops somewhere else. */
2740 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002741 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002742
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002743 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002744 drbd_update_congested(mdev);
2745 set_fs(KERNEL_DS);
2746 do {
2747 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2748 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002749 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002750 if (sent == -EAGAIN) {
2751 if (we_should_drop_the_connection(mdev,
2752 mdev->data.socket))
2753 break;
2754 else
2755 continue;
2756 }
2757 if (sent <= 0) {
2758 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2759 __func__, (int)size, len, sent);
2760 break;
2761 }
2762 len -= sent;
2763 offset += sent;
2764 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2765 set_fs(oldfs);
2766 clear_bit(NET_CONGESTED, &mdev->flags);
2767
2768 ok = (len == 0);
2769 if (likely(ok))
2770 mdev->send_cnt += size>>9;
2771 return ok;
2772}
2773
2774static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2775{
2776 struct bio_vec *bvec;
2777 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002778 /* hint all but last page with MSG_MORE */
Lars Ellenberg001a8862012-03-08 16:43:45 +01002779 bio_for_each_segment(bvec, bio, i) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002780 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002781 bvec->bv_offset, bvec->bv_len,
2782 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002783 return 0;
2784 }
2785 return 1;
2786}
2787
2788static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2789{
2790 struct bio_vec *bvec;
2791 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002792 /* hint all but last page with MSG_MORE */
Lars Ellenberg001a8862012-03-08 16:43:45 +01002793 bio_for_each_segment(bvec, bio, i) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002794 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002795 bvec->bv_offset, bvec->bv_len,
2796 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002797 return 0;
2798 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002799 return 1;
2800}
2801
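/* Zero-copy variant for an epoch entry: walk its page chain and hand each
 * page to sendpage(), up to PAGE_SIZE at a time. */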
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002802static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2803{
2804 struct page *page = e->pages;
2805 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002806 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002807 page_chain_for_each(page) {
2808 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002809 if (!_drbd_send_page(mdev, page, 0, l,
2810 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002811 return 0;
2812 len -= l;
2813 }
2814 return 1;
2815}
2816
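/* Translate the bio's request flags into the DP_* flags that go over the
 * wire; peers older than protocol version 95 only understand the SYNC hint. */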
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002817static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2818{
2819 if (mdev->agreed_pro_version >= 95)
2820 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002821 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2822 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2823 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2824 else
Jens Axboe721a9602011-03-09 11:56:30 +01002825 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002826}
2827
Philipp Reisnerb411b362009-09-25 16:07:19 -07002828/* Used to send write requests
2829 * R_PRIMARY -> Peer (P_DATA)
2830 */
2831int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2832{
2833 int ok = 1;
2834 struct p_data p;
2835 unsigned int dp_flags = 0;
2836 void *dgb;
2837 int dgs;
2838
2839 if (!drbd_get_data_sock(mdev))
2840 return 0;
2841
2842 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2843 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2844
Philipp Reisnerd5373382010-08-23 15:18:33 +02002845 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002846 p.head.h80.magic = BE_DRBD_MAGIC;
2847 p.head.h80.command = cpu_to_be16(P_DATA);
2848 p.head.h80.length =
2849 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2850 } else {
2851 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2852 p.head.h95.command = cpu_to_be16(P_DATA);
2853 p.head.h95.length =
2854 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2855 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002856
2857 p.sector = cpu_to_be64(req->sector);
2858 p.block_id = (unsigned long)req;
Lars Ellenberg671a74e2012-03-08 11:45:57 +01002859 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002860
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002861 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2862
Philipp Reisnerb411b362009-09-25 16:07:19 -07002863 if (mdev->state.conn >= C_SYNC_SOURCE &&
2864 mdev->state.conn <= C_PAUSED_SYNC_T)
2865 dp_flags |= DP_MAY_SET_IN_SYNC;
2866
2867 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002868 set_bit(UNPLUG_REMOTE, &mdev->flags);
2869 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002870 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002871 if (ok && dgs) {
2872 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002873 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002874 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002875 }
2876 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002877 /* For protocol A, we have to memcpy the payload into
2878 * socket buffers, as we may complete right away
 2879	 * as soon as we have handed it over to tcp, at which point the data
2880 * pages may become invalid.
2881 *
 2882	 * With data integrity enabled, we copy it as well, so we can be
 2883	 * sure that even if the bio pages are still being modified, it
 2884	 * won't change the data on the wire; thus if the digest checks
 2885	 * out ok after sending on this side, but does not match on the
 2886	 * receiving side, we have certainly detected corruption elsewhere.
2887 */
2888 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002889 ok = _drbd_send_bio(mdev, req->master_bio);
2890 else
2891 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002892
2893 /* double check digest, sometimes buffers have been modified in flight. */
2894 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002895 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002896 * currently supported in kernel crypto. */
2897 unsigned char digest[64];
2898 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2899 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2900 dev_warn(DEV,
2901 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2902 (unsigned long long)req->sector, req->size);
2903 }
2904 } /* else if (dgs > 64) {
2905 ... Be noisy about digest too large ...
2906 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002907 }
2908
2909 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002910
Philipp Reisnerb411b362009-09-25 16:07:19 -07002911 return ok;
2912}
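
/*
 * Header selection, as used by drbd_send_dblock() above and
 * drbd_send_block() below: a payload that fits into
 * DRBD_MAX_SIZE_H80_PACKET goes out with the compact h80 header
 * (BE_DRBD_MAGIC, 16 bit length), anything larger needs the h95 header
 * (BE_DRBD_MAGIC_BIG, 32 bit length).
 */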
2913
2914/* answer packet, used to send data back for read requests:
2915 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2916 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2917 */
2918int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2919 struct drbd_epoch_entry *e)
2920{
2921 int ok;
2922 struct p_data p;
2923 void *dgb;
2924 int dgs;
2925
2926 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2927 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2928
Philipp Reisnerd5373382010-08-23 15:18:33 +02002929 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002930 p.head.h80.magic = BE_DRBD_MAGIC;
2931 p.head.h80.command = cpu_to_be16(cmd);
2932 p.head.h80.length =
2933 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2934 } else {
2935 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2936 p.head.h95.command = cpu_to_be16(cmd);
2937 p.head.h95.length =
2938 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2939 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002940
2941 p.sector = cpu_to_be64(e->sector);
2942 p.block_id = e->block_id;
2943 /* p.seq_num = 0; No sequence numbers here.. */
2944
2945 /* Only called by our kernel thread.
2946 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2947 * in response to admin command or module unload.
2948 */
2949 if (!drbd_get_data_sock(mdev))
2950 return 0;
2951
Philipp Reisner0b70a132010-08-20 13:36:10 +02002952 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002953 if (ok && dgs) {
2954 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002955 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002956 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002957 }
2958 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002959 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002960
2961 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002962
Philipp Reisnerb411b362009-09-25 16:07:19 -07002963 return ok;
2964}
2965
Philipp Reisner73a01a12010-10-27 14:33:00 +02002966int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2967{
2968 struct p_block_desc p;
2969
2970 p.sector = cpu_to_be64(req->sector);
2971 p.blksize = cpu_to_be32(req->size);
2972
2973 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2974}
2975
Philipp Reisnerb411b362009-09-25 16:07:19 -07002976/*
2977 drbd_send distinguishes two cases:
2978
2979 Packets sent via the data socket "sock"
2980 and packets sent via the meta data socket "msock"
2981
2982 sock msock
2983 -----------------+-------------------------+------------------------------
2984 timeout conf.timeout / 2 conf.timeout / 2
2985 timeout action send a ping via msock Abort communication
2986 and close all sockets
2987*/
2988
2989/*
2990 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2991 */
2992int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2993 void *buf, size_t size, unsigned msg_flags)
2994{
2995 struct kvec iov;
2996 struct msghdr msg;
2997 int rv, sent = 0;
2998
2999 if (!sock)
3000 return -1000;
3001
3002 /* THINK if (signal_pending) return ... ? */
3003
3004 iov.iov_base = buf;
3005 iov.iov_len = size;
3006
3007 msg.msg_name = NULL;
3008 msg.msg_namelen = 0;
3009 msg.msg_control = NULL;
3010 msg.msg_controllen = 0;
3011 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
3012
3013 if (sock == mdev->data.socket) {
3014 mdev->ko_count = mdev->net_conf->ko_count;
3015 drbd_update_congested(mdev);
3016 }
3017 do {
3018 /* STRANGE
3019 * tcp_sendmsg does _not_ use its size parameter at all ?
3020 *
3021 * -EAGAIN on timeout, -EINTR on signal.
3022 */
3023/* THINK
3024 * do we need to block DRBD_SIG if sock == &meta.socket ??
3025 * otherwise wake_asender() might interrupt some send_*Ack !
3026 */
3027 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
3028 if (rv == -EAGAIN) {
3029 if (we_should_drop_the_connection(mdev, sock))
3030 break;
3031 else
3032 continue;
3033 }
3034 D_ASSERT(rv != 0);
3035 if (rv == -EINTR) {
3036 flush_signals(current);
3037 rv = 0;
3038 }
3039 if (rv < 0)
3040 break;
3041 sent += rv;
3042 iov.iov_base += rv;
3043 iov.iov_len -= rv;
3044 } while (sent < size);
3045
3046 if (sock == mdev->data.socket)
3047 clear_bit(NET_CONGESTED, &mdev->flags);
3048
3049 if (rv <= 0) {
3050 if (rv != -EAGAIN) {
3051 dev_err(DEV, "%s_sendmsg returned %d\n",
3052 sock == mdev->meta.socket ? "msock" : "sock",
3053 rv);
3054 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
3055 } else
3056 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
3057 }
3058
3059 return sent;
3060}
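
/*
 * Return value of drbd_send(): the number of bytes actually handed to
 * the socket, which equals @size only on complete success.  On a fatal
 * error the connection state has already been forced to C_BROKEN_PIPE,
 * or to C_TIMEOUT if the final error was -EAGAIN and
 * we_should_drop_the_connection() decided to give up.
 */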
3061
3062static int drbd_open(struct block_device *bdev, fmode_t mode)
3063{
3064 struct drbd_conf *mdev = bdev->bd_disk->private_data;
3065 unsigned long flags;
3066 int rv = 0;
3067
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02003068 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003069 spin_lock_irqsave(&mdev->req_lock, flags);
3070 /* to have a stable mdev->state.role
3071 * and no race with updating open_cnt */
3072
3073 if (mdev->state.role != R_PRIMARY) {
3074 if (mode & FMODE_WRITE)
3075 rv = -EROFS;
3076 else if (!allow_oos)
3077 rv = -EMEDIUMTYPE;
3078 }
3079
3080 if (!rv)
3081 mdev->open_cnt++;
3082 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02003083 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003084
3085 return rv;
3086}
3087
3088static int drbd_release(struct gendisk *gd, fmode_t mode)
3089{
3090 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02003091 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003092 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02003093 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003094 return 0;
3095}
3096
Philipp Reisnerb411b362009-09-25 16:07:19 -07003097static void drbd_set_defaults(struct drbd_conf *mdev)
3098{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02003099 /* This way we get a compile error when sync_conf grows,
 3100	   and we forget to initialize it here */
3101 mdev->sync_conf = (struct syncer_conf) {
3102 /* .rate = */ DRBD_RATE_DEF,
3103 /* .after = */ DRBD_AFTER_DEF,
3104 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02003105 /* .verify_alg = */ {}, 0,
3106 /* .cpu_mask = */ {}, 0,
3107 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02003108 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02003109 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3110 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3111 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3112 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003113 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3114 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02003115 };
3116
 3117	/* We have to do it this way, because the layout differs between
3118 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003119 mdev->state = (union drbd_state) {
3120 { .role = R_SECONDARY,
3121 .peer = R_UNKNOWN,
3122 .conn = C_STANDALONE,
3123 .disk = D_DISKLESS,
3124 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02003125 .susp = 0,
3126 .susp_nod = 0,
3127 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07003128 } };
3129}
3130
3131void drbd_init_set_defaults(struct drbd_conf *mdev)
3132{
3133 /* the memset(,0,) did most of this.
3134 * note: only assignments, no allocation in here */
3135
3136 drbd_set_defaults(mdev);
3137
Philipp Reisnerb411b362009-09-25 16:07:19 -07003138 atomic_set(&mdev->ap_bio_cnt, 0);
3139 atomic_set(&mdev->ap_pending_cnt, 0);
3140 atomic_set(&mdev->rs_pending_cnt, 0);
3141 atomic_set(&mdev->unacked_cnt, 0);
3142 atomic_set(&mdev->local_cnt, 0);
3143 atomic_set(&mdev->net_cnt, 0);
3144 atomic_set(&mdev->packet_seq, 0);
3145 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003146 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02003147 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003148 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02003149 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnere1711732011-06-27 11:51:46 +02003150 atomic_set(&mdev->md_io_in_use, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003151
Philipp Reisnerb411b362009-09-25 16:07:19 -07003152 mutex_init(&mdev->data.mutex);
3153 mutex_init(&mdev->meta.mutex);
3154 sema_init(&mdev->data.work.s, 0);
3155 sema_init(&mdev->meta.work.s, 0);
3156 mutex_init(&mdev->state_mutex);
3157
3158 spin_lock_init(&mdev->data.work.q_lock);
3159 spin_lock_init(&mdev->meta.work.q_lock);
3160
3161 spin_lock_init(&mdev->al_lock);
3162 spin_lock_init(&mdev->req_lock);
3163 spin_lock_init(&mdev->peer_seq_lock);
3164 spin_lock_init(&mdev->epoch_lock);
3165
3166 INIT_LIST_HEAD(&mdev->active_ee);
3167 INIT_LIST_HEAD(&mdev->sync_ee);
3168 INIT_LIST_HEAD(&mdev->done_ee);
3169 INIT_LIST_HEAD(&mdev->read_ee);
3170 INIT_LIST_HEAD(&mdev->net_ee);
3171 INIT_LIST_HEAD(&mdev->resync_reads);
3172 INIT_LIST_HEAD(&mdev->data.work.q);
3173 INIT_LIST_HEAD(&mdev->meta.work.q);
3174 INIT_LIST_HEAD(&mdev->resync_work.list);
3175 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003176 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003177 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003178 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003179 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003180
Philipp Reisner794abb72010-12-27 11:51:23 +01003181 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003182 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003183 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003184 mdev->md_sync_work.cb = w_md_sync;
3185 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003186 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003187 init_timer(&mdev->resync_timer);
3188 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003189 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003190 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003191 mdev->resync_timer.function = resync_timer_fn;
3192 mdev->resync_timer.data = (unsigned long) mdev;
3193 mdev->md_sync_timer.function = md_sync_timer_fn;
3194 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003195 mdev->start_resync_timer.function = start_resync_timer_fn;
3196 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003197 mdev->request_timer.function = request_timer_fn;
3198 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003199
3200 init_waitqueue_head(&mdev->misc_wait);
3201 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003202 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003203 init_waitqueue_head(&mdev->ee_wait);
3204 init_waitqueue_head(&mdev->al_wait);
3205 init_waitqueue_head(&mdev->seq_wait);
3206
3207 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3208 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3209 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3210
3211 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003212 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003213 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003214 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3215 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003216}
3217
3218void drbd_mdev_cleanup(struct drbd_conf *mdev)
3219{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003220 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003221 if (mdev->receiver.t_state != None)
3222 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3223 mdev->receiver.t_state);
3224
3225 /* no need to lock it, I'm the only thread alive */
3226 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3227 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3228 mdev->al_writ_cnt =
3229 mdev->bm_writ_cnt =
3230 mdev->read_cnt =
3231 mdev->recv_cnt =
3232 mdev->send_cnt =
3233 mdev->writ_cnt =
3234 mdev->p_size =
3235 mdev->rs_start =
3236 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003237 mdev->rs_failed = 0;
3238 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003239 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003240 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3241 mdev->rs_mark_left[i] = 0;
3242 mdev->rs_mark_time[i] = 0;
3243 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003244 D_ASSERT(mdev->net_conf == NULL);
3245
3246 drbd_set_my_capacity(mdev, 0);
3247 if (mdev->bitmap) {
3248 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003249 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003250 drbd_bm_cleanup(mdev);
3251 }
3252
3253 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003254 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003255
3256 /*
3257 * currently we drbd_init_ee only on module load, so
3258 * we may do drbd_release_ee only on module unload!
3259 */
3260 D_ASSERT(list_empty(&mdev->active_ee));
3261 D_ASSERT(list_empty(&mdev->sync_ee));
3262 D_ASSERT(list_empty(&mdev->done_ee));
3263 D_ASSERT(list_empty(&mdev->read_ee));
3264 D_ASSERT(list_empty(&mdev->net_ee));
3265 D_ASSERT(list_empty(&mdev->resync_reads));
3266 D_ASSERT(list_empty(&mdev->data.work.q));
3267 D_ASSERT(list_empty(&mdev->meta.work.q));
3268 D_ASSERT(list_empty(&mdev->resync_work.list));
3269 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003270 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003271
3272 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003273}
3274
3275
3276static void drbd_destroy_mempools(void)
3277{
3278 struct page *page;
3279
3280 while (drbd_pp_pool) {
3281 page = drbd_pp_pool;
3282 drbd_pp_pool = (struct page *)page_private(page);
3283 __free_page(page);
3284 drbd_pp_vacant--;
3285 }
3286
3287 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3288
Lars Ellenberg9476f392011-02-23 17:02:01 +01003289 if (drbd_md_io_bio_set)
3290 bioset_free(drbd_md_io_bio_set);
Lars Ellenberg42818082011-02-23 12:39:46 +01003291 if (drbd_md_io_page_pool)
3292 mempool_destroy(drbd_md_io_page_pool);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003293 if (drbd_ee_mempool)
3294 mempool_destroy(drbd_ee_mempool);
3295 if (drbd_request_mempool)
3296 mempool_destroy(drbd_request_mempool);
3297 if (drbd_ee_cache)
3298 kmem_cache_destroy(drbd_ee_cache);
3299 if (drbd_request_cache)
3300 kmem_cache_destroy(drbd_request_cache);
3301 if (drbd_bm_ext_cache)
3302 kmem_cache_destroy(drbd_bm_ext_cache);
3303 if (drbd_al_ext_cache)
3304 kmem_cache_destroy(drbd_al_ext_cache);
3305
Lars Ellenberg9476f392011-02-23 17:02:01 +01003306 drbd_md_io_bio_set = NULL;
Lars Ellenberg42818082011-02-23 12:39:46 +01003307 drbd_md_io_page_pool = NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003308 drbd_ee_mempool = NULL;
3309 drbd_request_mempool = NULL;
3310 drbd_ee_cache = NULL;
3311 drbd_request_cache = NULL;
3312 drbd_bm_ext_cache = NULL;
3313 drbd_al_ext_cache = NULL;
3314
3315 return;
3316}
3317
3318static int drbd_create_mempools(void)
3319{
3320 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003321 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003322 int i;
3323
3324 /* prepare our caches and mempools */
3325 drbd_request_mempool = NULL;
3326 drbd_ee_cache = NULL;
3327 drbd_request_cache = NULL;
3328 drbd_bm_ext_cache = NULL;
3329 drbd_al_ext_cache = NULL;
3330 drbd_pp_pool = NULL;
Lars Ellenberg42818082011-02-23 12:39:46 +01003331 drbd_md_io_page_pool = NULL;
Lars Ellenberg9476f392011-02-23 17:02:01 +01003332 drbd_md_io_bio_set = NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003333
3334 /* caches */
3335 drbd_request_cache = kmem_cache_create(
3336 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3337 if (drbd_request_cache == NULL)
3338 goto Enomem;
3339
3340 drbd_ee_cache = kmem_cache_create(
3341 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3342 if (drbd_ee_cache == NULL)
3343 goto Enomem;
3344
3345 drbd_bm_ext_cache = kmem_cache_create(
3346 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3347 if (drbd_bm_ext_cache == NULL)
3348 goto Enomem;
3349
3350 drbd_al_ext_cache = kmem_cache_create(
3351 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3352 if (drbd_al_ext_cache == NULL)
3353 goto Enomem;
3354
3355 /* mempools */
Lars Ellenberg9476f392011-02-23 17:02:01 +01003356#ifdef COMPAT_HAVE_BIOSET_CREATE
3357 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
3358 if (drbd_md_io_bio_set == NULL)
3359 goto Enomem;
3360#endif
3361
Lars Ellenberg42818082011-02-23 12:39:46 +01003362 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
3363 if (drbd_md_io_page_pool == NULL)
3364 goto Enomem;
3365
Philipp Reisnerb411b362009-09-25 16:07:19 -07003366 drbd_request_mempool = mempool_create(number,
3367 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3368 if (drbd_request_mempool == NULL)
3369 goto Enomem;
3370
3371 drbd_ee_mempool = mempool_create(number,
3372 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003373 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003374 goto Enomem;
3375
3376 /* drbd's page pool */
3377 spin_lock_init(&drbd_pp_lock);
3378
3379 for (i = 0; i < number; i++) {
3380 page = alloc_page(GFP_HIGHUSER);
3381 if (!page)
3382 goto Enomem;
3383 set_page_private(page, (unsigned long)drbd_pp_pool);
3384 drbd_pp_pool = page;
3385 }
3386 drbd_pp_vacant = number;
3387
3388 return 0;
3389
3390Enomem:
3391 drbd_destroy_mempools(); /* in case we allocated some */
3392 return -ENOMEM;
3393}
3394
3395static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3396 void *unused)
3397{
3398 /* just so we have it. you never know what interesting things we
3399 * might want to do here some day...
3400 */
3401
3402 return NOTIFY_DONE;
3403}
3404
3405static struct notifier_block drbd_notifier = {
3406 .notifier_call = drbd_notify_sys,
3407};
3408
3409static void drbd_release_ee_lists(struct drbd_conf *mdev)
3410{
3411 int rr;
3412
3413 rr = drbd_release_ee(mdev, &mdev->active_ee);
3414 if (rr)
3415 dev_err(DEV, "%d EEs in active list found!\n", rr);
3416
3417 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3418 if (rr)
3419 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3420
3421 rr = drbd_release_ee(mdev, &mdev->read_ee);
3422 if (rr)
3423 dev_err(DEV, "%d EEs in read list found!\n", rr);
3424
3425 rr = drbd_release_ee(mdev, &mdev->done_ee);
3426 if (rr)
3427 dev_err(DEV, "%d EEs in done list found!\n", rr);
3428
3429 rr = drbd_release_ee(mdev, &mdev->net_ee);
3430 if (rr)
3431 dev_err(DEV, "%d EEs in net list found!\n", rr);
3432}
3433
3434/* caution. no locking.
3435 * currently only used from module cleanup code. */
3436static void drbd_delete_device(unsigned int minor)
3437{
3438 struct drbd_conf *mdev = minor_to_mdev(minor);
3439
3440 if (!mdev)
3441 return;
3442
Philipp Reisnerdfa8bed2011-06-29 14:06:08 +02003443 del_timer_sync(&mdev->request_timer);
3444
Philipp Reisnerb411b362009-09-25 16:07:19 -07003445 /* paranoia asserts */
3446 if (mdev->open_cnt != 0)
3447 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3448 __FILE__ , __LINE__);
3449
3450 ERR_IF (!list_empty(&mdev->data.work.q)) {
3451 struct list_head *lp;
3452 list_for_each(lp, &mdev->data.work.q) {
3453 dev_err(DEV, "lp = %p\n", lp);
3454 }
3455 };
3456 /* end paranoia asserts */
3457
3458 del_gendisk(mdev->vdisk);
3459
3460 /* cleanup stuff that may have been allocated during
3461 * device (re-)configuration or state changes */
3462
3463 if (mdev->this_bdev)
3464 bdput(mdev->this_bdev);
3465
3466 drbd_free_resources(mdev);
3467
3468 drbd_release_ee_lists(mdev);
3469
Bart Van Assche24c48302011-05-21 18:32:29 +02003470 /* should be freed on disconnect? */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003471 kfree(mdev->ee_hash);
3472 /*
3473 mdev->ee_hash_s = 0;
3474 mdev->ee_hash = NULL;
3475 */
3476
3477 lc_destroy(mdev->act_log);
3478 lc_destroy(mdev->resync);
3479
3480 kfree(mdev->p_uuid);
3481 /* mdev->p_uuid = NULL; */
3482
3483 kfree(mdev->int_dig_out);
3484 kfree(mdev->int_dig_in);
3485 kfree(mdev->int_dig_vv);
3486
3487 /* cleanup the rest that has been
3488 * allocated from drbd_new_device
3489 * and actually free the mdev itself */
3490 drbd_free_mdev(mdev);
3491}
3492
3493static void drbd_cleanup(void)
3494{
3495 unsigned int i;
3496
3497 unregister_reboot_notifier(&drbd_notifier);
3498
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003499 /* first remove proc,
3500 * drbdsetup uses it's presence to detect
3501 * whether DRBD is loaded.
3502 * If we would get stuck in proc removal,
3503 * but have netlink already deregistered,
3504 * some drbdsetup commands may wait forever
3505 * for an answer.
3506 */
3507 if (drbd_proc)
3508 remove_proc_entry("drbd", NULL);
3509
Philipp Reisnerb411b362009-09-25 16:07:19 -07003510 drbd_nl_cleanup();
3511
3512 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003513 i = minor_count;
3514 while (i--)
3515 drbd_delete_device(i);
3516 drbd_destroy_mempools();
3517 }
3518
3519 kfree(minor_table);
3520
3521 unregister_blkdev(DRBD_MAJOR, "drbd");
3522
3523 printk(KERN_INFO "drbd: module cleanup done.\n");
3524}
3525
3526/**
Artem Bityutskiyd97482e2012-07-25 18:12:12 +03003527 * drbd_congested() - Callback for the flusher thread
Philipp Reisnerb411b362009-09-25 16:07:19 -07003528 * @congested_data: User data
Artem Bityutskiyd97482e2012-07-25 18:12:12 +03003529 * @bdi_bits: Bits the BDI flusher thread is currently interested in
Philipp Reisnerb411b362009-09-25 16:07:19 -07003530 *
3531 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3532 */
3533static int drbd_congested(void *congested_data, int bdi_bits)
3534{
3535 struct drbd_conf *mdev = congested_data;
3536 struct request_queue *q;
3537 char reason = '-';
3538 int r = 0;
3539
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003540 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003541 /* DRBD has frozen IO */
3542 r = bdi_bits;
3543 reason = 'd';
3544 goto out;
3545 }
3546
Lars Ellenbergc2ba6862012-06-14 15:14:06 +02003547 if (test_bit(CALLBACK_PENDING, &mdev->flags)) {
3548 r |= (1 << BDI_async_congested);
3549 /* Without good local data, we would need to read from remote,
3550 * and that would need the worker thread as well, which is
3551 * currently blocked waiting for that usermode helper to
3552 * finish.
3553 */
3554 if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
3555 r |= (1 << BDI_sync_congested);
3556 else
3557 put_ldev(mdev);
3558 r &= bdi_bits;
3559 reason = 'c';
3560 goto out;
3561 }
3562
Philipp Reisnerb411b362009-09-25 16:07:19 -07003563 if (get_ldev(mdev)) {
3564 q = bdev_get_queue(mdev->ldev->backing_bdev);
3565 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3566 put_ldev(mdev);
3567 if (r)
3568 reason = 'b';
3569 }
3570
3571 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3572 r |= (1 << BDI_async_congested);
3573 reason = reason == 'b' ? 'a' : 'n';
3574 }
3575
3576out:
3577 mdev->congestion_reason = reason;
3578 return r;
3579}
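
/*
 * The single character stored in congestion_reason above decodes as:
 *   'd'  DRBD itself has frozen IO (may_inc_ap_bio() failed)
 *   'c'  a usermode helper callback is still pending
 *   'b'  the local backing device is congested
 *   'a'  backing device congested and the network is async-congested
 *   'n'  only the network is congested
 *   '-'  not congested
 */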
3580
3581struct drbd_conf *drbd_new_device(unsigned int minor)
3582{
3583 struct drbd_conf *mdev;
3584 struct gendisk *disk;
3585 struct request_queue *q;
3586
3587 /* GFP_KERNEL, we are outside of all write-out paths */
3588 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3589 if (!mdev)
3590 return NULL;
3591 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3592 goto out_no_cpumask;
3593
3594 mdev->minor = minor;
3595
3596 drbd_init_set_defaults(mdev);
3597
3598 q = blk_alloc_queue(GFP_KERNEL);
3599 if (!q)
3600 goto out_no_q;
3601 mdev->rq_queue = q;
3602 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003603
3604 disk = alloc_disk(1);
3605 if (!disk)
3606 goto out_no_disk;
3607 mdev->vdisk = disk;
3608
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003609 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003610
3611 disk->queue = q;
3612 disk->major = DRBD_MAJOR;
3613 disk->first_minor = minor;
3614 disk->fops = &drbd_ops;
3615 sprintf(disk->disk_name, "drbd%d", minor);
3616 disk->private_data = mdev;
3617
3618 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3619 /* we have no partitions. we contain only ourselves. */
3620 mdev->this_bdev->bd_contains = mdev->this_bdev;
3621
3622 q->backing_dev_info.congested_fn = drbd_congested;
3623 q->backing_dev_info.congested_data = mdev;
3624
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003625 blk_queue_make_request(q, drbd_make_request);
Lars Ellenberga73ff322012-06-25 19:15:38 +02003626 blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003627	/* Setting the max_hw_sectors to an odd value of 8kibyte here.
 3628	   This triggers a max_bio_size message upon first attach or connect */
3629 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003630 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3631 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003632 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003633
3634 mdev->md_io_page = alloc_page(GFP_KERNEL);
3635 if (!mdev->md_io_page)
3636 goto out_no_io_page;
3637
3638 if (drbd_bm_init(mdev))
3639 goto out_no_bitmap;
3640 /* no need to lock access, we are still initializing this minor device. */
3641 if (!tl_init(mdev))
3642 goto out_no_tl;
3643
3644 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3645 if (!mdev->app_reads_hash)
3646 goto out_no_app_reads;
3647
3648 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3649 if (!mdev->current_epoch)
3650 goto out_no_epoch;
3651
3652 INIT_LIST_HEAD(&mdev->current_epoch->list);
3653 mdev->epochs = 1;
3654
3655 return mdev;
3656
3657/* out_whatever_else:
3658 kfree(mdev->current_epoch); */
3659out_no_epoch:
3660 kfree(mdev->app_reads_hash);
3661out_no_app_reads:
3662 tl_cleanup(mdev);
3663out_no_tl:
3664 drbd_bm_cleanup(mdev);
3665out_no_bitmap:
3666 __free_page(mdev->md_io_page);
3667out_no_io_page:
3668 put_disk(disk);
3669out_no_disk:
3670 blk_cleanup_queue(q);
3671out_no_q:
3672 free_cpumask_var(mdev->cpu_mask);
3673out_no_cpumask:
3674 kfree(mdev);
3675 return NULL;
3676}
3677
3678/* counterpart of drbd_new_device.
3679 * last part of drbd_delete_device. */
3680void drbd_free_mdev(struct drbd_conf *mdev)
3681{
3682 kfree(mdev->current_epoch);
3683 kfree(mdev->app_reads_hash);
3684 tl_cleanup(mdev);
3685 if (mdev->bitmap) /* should no longer be there. */
3686 drbd_bm_cleanup(mdev);
3687 __free_page(mdev->md_io_page);
3688 put_disk(mdev->vdisk);
3689 blk_cleanup_queue(mdev->rq_queue);
3690 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003691 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003692 kfree(mdev);
3693}
3694
3695
3696int __init drbd_init(void)
3697{
3698 int err;
3699
3700 if (sizeof(struct p_handshake) != 80) {
3701 printk(KERN_ERR
3702 "drbd: never change the size or layout "
3703 "of the HandShake packet.\n");
3704 return -EINVAL;
3705 }
3706
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003707 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003708 printk(KERN_ERR
3709 "drbd: invalid minor_count (%d)\n", minor_count);
3710#ifdef MODULE
3711 return -EINVAL;
3712#else
3713 minor_count = 8;
3714#endif
3715 }
3716
3717 err = drbd_nl_init();
3718 if (err)
3719 return err;
3720
3721 err = register_blkdev(DRBD_MAJOR, "drbd");
3722 if (err) {
3723 printk(KERN_ERR
3724 "drbd: unable to register block device major %d\n",
3725 DRBD_MAJOR);
3726 return err;
3727 }
3728
3729 register_reboot_notifier(&drbd_notifier);
3730
3731 /*
3732 * allocate all necessary structs
3733 */
3734 err = -ENOMEM;
3735
3736 init_waitqueue_head(&drbd_pp_wait);
3737
3738 drbd_proc = NULL; /* play safe for drbd_cleanup */
3739 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3740 GFP_KERNEL);
3741 if (!minor_table)
3742 goto Enomem;
3743
3744 err = drbd_create_mempools();
3745 if (err)
3746 goto Enomem;
3747
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003748 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003749 if (!drbd_proc) {
3750 printk(KERN_ERR "drbd: unable to register proc file\n");
3751 goto Enomem;
3752 }
3753
3754 rwlock_init(&global_state_lock);
3755
3756 printk(KERN_INFO "drbd: initialized. "
3757 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3758 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3759 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3760 printk(KERN_INFO "drbd: registered as block device major %d\n",
3761 DRBD_MAJOR);
3762 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3763
3764 return 0; /* Success! */
3765
3766Enomem:
3767 drbd_cleanup();
3768 if (err == -ENOMEM)
3769 /* currently always the case */
3770 printk(KERN_ERR "drbd: ran out of memory\n");
3771 else
3772 printk(KERN_ERR "drbd: initialization failure\n");
3773 return err;
3774}
3775
3776void drbd_free_bc(struct drbd_backing_dev *ldev)
3777{
3778 if (ldev == NULL)
3779 return;
3780
Tejun Heoe525fd82010-11-13 11:55:17 +01003781 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3782 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003783
3784 kfree(ldev);
3785}
3786
3787void drbd_free_sock(struct drbd_conf *mdev)
3788{
3789 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003790 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003791 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3792 sock_release(mdev->data.socket);
3793 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003794 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003795 }
3796 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003797 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003798 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3799 sock_release(mdev->meta.socket);
3800 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003801 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003802 }
3803}
3804
3805
3806void drbd_free_resources(struct drbd_conf *mdev)
3807{
3808 crypto_free_hash(mdev->csums_tfm);
3809 mdev->csums_tfm = NULL;
3810 crypto_free_hash(mdev->verify_tfm);
3811 mdev->verify_tfm = NULL;
3812 crypto_free_hash(mdev->cram_hmac_tfm);
3813 mdev->cram_hmac_tfm = NULL;
3814 crypto_free_hash(mdev->integrity_w_tfm);
3815 mdev->integrity_w_tfm = NULL;
3816 crypto_free_hash(mdev->integrity_r_tfm);
3817 mdev->integrity_r_tfm = NULL;
3818
3819 drbd_free_sock(mdev);
3820
3821 __no_warn(local,
3822 drbd_free_bc(mdev->ldev);
3823 mdev->ldev = NULL;);
3824}
3825
3826/* meta data management */
3827
3828struct meta_data_on_disk {
3829 u64 la_size; /* last agreed size. */
3830 u64 uuid[UI_SIZE]; /* UUIDs. */
3831 u64 device_uuid;
3832 u64 reserved_u64_1;
3833 u32 flags; /* MDF */
3834 u32 magic;
3835 u32 md_size_sect;
3836 u32 al_offset; /* offset to this block */
3837 u32 al_nr_extents; /* important for restoring the AL */
3838 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3839 u32 bm_offset; /* offset to the bitmap, from here */
3840 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003841 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3842 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003843
3844} __packed;
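
/*
 * All multi-byte fields of struct meta_data_on_disk are kept big-endian
 * on disk: drbd_md_sync() below converts with cpu_to_be*() before
 * writing, drbd_md_read() converts back with be*_to_cpu().  The 512
 * byte buffer holding it is zeroed before being filled in, so the
 * reserved fields are always written as 0.
 */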
3845
3846/**
3847 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3848 * @mdev: DRBD device.
3849 */
3850void drbd_md_sync(struct drbd_conf *mdev)
3851{
3852 struct meta_data_on_disk *buffer;
3853 sector_t sector;
3854 int i;
3855
Lars Ellenbergee15b032010-09-03 10:00:09 +02003856 del_timer(&mdev->md_sync_timer);
3857 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003858 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3859 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003860
 3861	/* We use D_FAILED here and not D_ATTACHING because we try to write
3862 * metadata even if we detach due to a disk failure! */
3863 if (!get_ldev_if_state(mdev, D_FAILED))
3864 return;
3865
Philipp Reisnere1711732011-06-27 11:51:46 +02003866 buffer = drbd_md_get_buffer(mdev);
3867 if (!buffer)
3868 goto out;
3869
Philipp Reisnerb411b362009-09-25 16:07:19 -07003870 memset(buffer, 0, 512);
3871
3872 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3873 for (i = UI_CURRENT; i < UI_SIZE; i++)
3874 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3875 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3876 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3877
3878 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3879 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3880 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3881 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3882 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3883
3884 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003885 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003886
3887 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3888 sector = mdev->ldev->md.md_offset;
3889
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003890 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003891 /* this was a try anyways ... */
3892 dev_err(DEV, "meta data update failed!\n");
Lars Ellenberg383606e2012-06-14 14:21:32 +02003893 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003894 }
3895
3896 /* Update mdev->ldev->md.la_size_sect,
3897 * since we updated it on metadata. */
3898 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3899
Philipp Reisnere1711732011-06-27 11:51:46 +02003900 drbd_md_put_buffer(mdev);
3901out:
Philipp Reisnerb411b362009-09-25 16:07:19 -07003902 put_ldev(mdev);
3903}
3904
3905/**
3906 * drbd_md_read() - Reads in the meta data super block
3907 * @mdev: DRBD device.
3908 * @bdev: Device from which the meta data should be read in.
3909 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003910 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003911 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3912 */
3913int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3914{
3915 struct meta_data_on_disk *buffer;
3916 int i, rv = NO_ERROR;
3917
3918 if (!get_ldev_if_state(mdev, D_ATTACHING))
3919 return ERR_IO_MD_DISK;
3920
Philipp Reisnere1711732011-06-27 11:51:46 +02003921 buffer = drbd_md_get_buffer(mdev);
3922 if (!buffer)
3923 goto out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003924
3925 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003926 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003927 called BEFORE disk is attached */
3928 dev_err(DEV, "Error while reading metadata.\n");
3929 rv = ERR_IO_MD_DISK;
3930 goto err;
3931 }
3932
3933 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3934 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3935 rv = ERR_MD_INVALID;
3936 goto err;
3937 }
3938 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3939 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3940 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3941 rv = ERR_MD_INVALID;
3942 goto err;
3943 }
3944 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3945 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3946 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3947 rv = ERR_MD_INVALID;
3948 goto err;
3949 }
3950 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3951 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3952 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3953 rv = ERR_MD_INVALID;
3954 goto err;
3955 }
3956
3957 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3958 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3959 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3960 rv = ERR_MD_INVALID;
3961 goto err;
3962 }
3963
3964 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3965 for (i = UI_CURRENT; i < UI_SIZE; i++)
3966 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3967 bdev->md.flags = be32_to_cpu(buffer->flags);
3968 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3969 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3970
Philipp Reisner99432fc2011-05-20 16:39:13 +02003971 spin_lock_irq(&mdev->req_lock);
3972 if (mdev->state.conn < C_CONNECTED) {
Lars Ellenbergdb141b22012-06-25 19:15:58 +02003973 unsigned int peer;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003974 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
Lars Ellenbergdb141b22012-06-25 19:15:58 +02003975 peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003976 mdev->peer_max_bio_size = peer;
3977 }
3978 spin_unlock_irq(&mdev->req_lock);
3979
Philipp Reisnerb411b362009-09-25 16:07:19 -07003980 if (mdev->sync_conf.al_extents < 7)
3981 mdev->sync_conf.al_extents = 127;
3982
3983 err:
Philipp Reisnere1711732011-06-27 11:51:46 +02003984 drbd_md_put_buffer(mdev);
3985 out:
Philipp Reisnerb411b362009-09-25 16:07:19 -07003986 put_ldev(mdev);
3987
3988 return rv;
3989}
3990
3991/**
3992 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3993 * @mdev: DRBD device.
3994 *
3995 * Call this function if you change anything that should be written to
3996 * the meta-data super block. This function sets MD_DIRTY, and starts a
 3997 * timer that ensures drbd_md_sync() gets called within five seconds.
3998 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003999#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02004000void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
4001{
4002 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
4003 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
4004 mdev->last_md_mark_dirty.line = line;
4005 mdev->last_md_mark_dirty.func = func;
4006 }
4007}
4008#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07004009void drbd_md_mark_dirty(struct drbd_conf *mdev)
4010{
Lars Ellenbergee15b032010-09-03 10:00:09 +02004011 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02004012 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004013}
Lars Ellenbergee15b032010-09-03 10:00:09 +02004014#endif
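
/*
 * A sketched (not an actual call site) dirty-then-sync sequence:
 *
 *	mdev->ldev->md.uuid[UI_BITMAP] = 0;
 *	drbd_md_mark_dirty(mdev);
 *
 * If drbd_md_sync() is not called within the timer period (5*HZ in the
 * regular build, HZ with DEBUG), md_sync_timer_fn() queues w_md_sync(),
 * which warns and writes the super block itself.
 */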
Philipp Reisnerb411b362009-09-25 16:07:19 -07004015
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004016void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004017{
4018 int i;
4019
Lars Ellenberg62b0da32011-01-20 13:25:21 +01004020 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004021 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07004022}
4023
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004024void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004025{
4026 if (idx == UI_CURRENT) {
4027 if (mdev->state.role == R_PRIMARY)
4028 val |= 1;
4029 else
4030 val &= ~((u64)1);
4031
4032 drbd_set_ed_uuid(mdev, val);
4033 }
4034
4035 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004036 drbd_md_mark_dirty(mdev);
4037}
4038
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004039void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4040{
4041 unsigned long flags;
4042 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
4043 __drbd_uuid_set(mdev, idx, val);
4044 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4045}
Philipp Reisnerb411b362009-09-25 16:07:19 -07004046
4047void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4048{
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004049 unsigned long flags;
4050 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004051 if (mdev->ldev->md.uuid[idx]) {
4052 drbd_uuid_move_history(mdev);
4053 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07004054 }
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004055 __drbd_uuid_set(mdev, idx, val);
4056 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004057}
4058
4059/**
4060 * drbd_uuid_new_current() - Creates a new current UUID
4061 * @mdev: DRBD device.
4062 *
4063 * Creates a new current UUID, and rotates the old current UUID into
4064 * the bitmap slot. Causes an incremental resync upon next connect.
4065 */
4066void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
4067{
4068 u64 val;
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004069 unsigned long long bm_uuid;
4070
4071 get_random_bytes(&val, sizeof(u64));
4072
4073 spin_lock_irq(&mdev->ldev->md.uuid_lock);
4074 bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07004075
Lars Ellenberg62b0da32011-01-20 13:25:21 +01004076 if (bm_uuid)
4077 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4078
Philipp Reisnerb411b362009-09-25 16:07:19 -07004079 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004080 __drbd_uuid_set(mdev, UI_CURRENT, val);
4081 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004082
Lars Ellenberg62b0da32011-01-20 13:25:21 +01004083 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02004084 /* get it to stable storage _now_ */
4085 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004086}
4087
4088void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
4089{
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004090 unsigned long flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004091 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
4092 return;
4093
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004094 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004095 if (val == 0) {
4096 drbd_uuid_move_history(mdev);
4097 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
4098 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004099 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01004100 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4101 if (bm_uuid)
4102 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004103
Lars Ellenberg62b0da32011-01-20 13:25:21 +01004104 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004105 }
Philipp Reisner9f2247b2012-08-16 14:25:58 +02004106 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4107
Philipp Reisnerb411b362009-09-25 16:07:19 -07004108 drbd_md_mark_dirty(mdev);
4109}
4110
4111/**
4112 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4113 * @mdev: DRBD device.
4114 *
4115 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
4116 */
4117int drbd_bmio_set_n_write(struct drbd_conf *mdev)
4118{
4119 int rv = -EIO;
4120
4121 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4122 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4123 drbd_md_sync(mdev);
4124 drbd_bm_set_all(mdev);
4125
4126 rv = drbd_bm_write(mdev);
4127
4128 if (!rv) {
4129 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
4130 drbd_md_sync(mdev);
4131 }
4132
4133 put_ldev(mdev);
4134 }
4135
4136 return rv;
4137}
4138
4139/**
4140 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4141 * @mdev: DRBD device.
4142 *
4143 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
4144 */
4145int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
4146{
4147 int rv = -EIO;
4148
Philipp Reisner07782862010-08-31 12:00:50 +02004149 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004150 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4151 drbd_bm_clear_all(mdev);
4152 rv = drbd_bm_write(mdev);
4153 put_ldev(mdev);
4154 }
4155
4156 return rv;
4157}
4158
4159static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4160{
4161 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01004162 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004163
4164 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4165
Lars Ellenberg02851e92010-12-16 14:47:39 +01004166 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004167 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01004168 rv = work->io_fn(mdev);
4169 drbd_bm_unlock(mdev);
4170 put_ldev(mdev);
4171 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004172
4173 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01004174 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07004175 wake_up(&mdev->misc_wait);
4176
4177 if (work->done)
4178 work->done(mdev, rv);
4179
4180 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4181 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004182 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004183
4184 return 1;
4185}
4186
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004187void drbd_ldev_destroy(struct drbd_conf *mdev)
4188{
4189 lc_destroy(mdev->resync);
4190 mdev->resync = NULL;
4191 lc_destroy(mdev->act_log);
4192 mdev->act_log = NULL;
4193 __no_warn(local,
4194 drbd_free_bc(mdev->ldev);
4195 mdev->ldev = NULL;);
4196
4197 if (mdev->md_io_tmpp) {
4198 __free_page(mdev->md_io_tmpp);
4199 mdev->md_io_tmpp = NULL;
4200 }
4201 clear_bit(GO_DISKLESS, &mdev->flags);
4202}
4203
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004204static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4205{
4206 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02004207 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4208 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004209 * the protected members anymore, though, so once put_ldev reaches zero
4210 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004211 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004212 return 1;
4213}
4214
4215void drbd_go_diskless(struct drbd_conf *mdev)
4216{
4217 D_ASSERT(mdev->state.disk == D_FAILED);
4218 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02004219 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004220}
4221
Philipp Reisnerb411b362009-09-25 16:07:19 -07004222/**
4223 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4224 * @mdev: DRBD device.
4225 * @io_fn: IO callback to be called when bitmap IO is possible
4226 * @done: callback to be called after the bitmap IO was performed
 4227 * @why:	Descriptive text of the reason for doing the IO
 * @flags:	Bitmap locking flags, passed on to drbd_bm_lock()
4228 *
 4229 * While IO on the bitmap happens we freeze application IO, thus ensuring
 4230 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4231 * called from worker context. It MUST NOT be used while a previous such
4232 * work is still pending!
4233 */
4234void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4235 int (*io_fn)(struct drbd_conf *),
4236 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004237 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004238{
4239 D_ASSERT(current == mdev->worker.task);
4240
4241 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4242 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4243 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4244 if (mdev->bm_io_work.why)
4245 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4246 why, mdev->bm_io_work.why);
4247
4248 mdev->bm_io_work.io_fn = io_fn;
4249 mdev->bm_io_work.done = done;
4250 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004251 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004252
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004253 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004254 set_bit(BITMAP_IO, &mdev->flags);
4255 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004256 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004257 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004258 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004259 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004260}
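
/*
 * Minimal usage sketch, to be run from worker context; the done
 * callback name is made up and the flags value is just one plausible
 * choice:
 *
 *	static void my_bitmap_done(struct drbd_conf *mdev, int rv)
 *	{
 *		if (rv)
 *			dev_err(DEV, "bitmap write failed\n");
 *	}
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     my_bitmap_done, "set_n_write",
 *			     BM_LOCKED_SET_ALLOWED);
 *
 * w_bitmap_io() then runs the io_fn once application IO has drained
 * (ap_bio_cnt == 0) and finally invokes the done callback with the
 * io_fn's return value.
 */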
4261
4262/**
4263 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4264 * @mdev: DRBD device.
4265 * @io_fn: IO callback to be called when bitmap IO is possible
 4266 * @why:	Descriptive text of the reason for doing the IO
 * @flags:	Bitmap locking flags, passed on to drbd_bm_lock()
4267 *
 4268 * Freezes application IO while the actual IO operation runs. This
 4269 * function MAY NOT be called from worker context.
4270 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004271int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4272 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004273{
4274 int rv;
4275
4276 D_ASSERT(current != mdev->worker.task);
4277
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004278 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4279 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004280
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004281 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004282 rv = io_fn(mdev);
4283 drbd_bm_unlock(mdev);
4284
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004285 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4286 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004287
4288 return rv;
4289}
4290
4291void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4292{
4293 if ((mdev->ldev->md.flags & flag) != flag) {
4294 drbd_md_mark_dirty(mdev);
4295 mdev->ldev->md.flags |= flag;
4296 }
4297}
4298
4299void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4300{
4301 if ((mdev->ldev->md.flags & flag) != 0) {
4302 drbd_md_mark_dirty(mdev);
4303 mdev->ldev->md.flags &= ~flag;
4304 }
4305}
4306int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4307{
4308 return (bdev->md.flags & flag) != 0;
4309}
4310
4311static void md_sync_timer_fn(unsigned long data)
4312{
4313 struct drbd_conf *mdev = (struct drbd_conf *) data;
4314
4315 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4316}
4317
4318static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4319{
4320 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004321#ifdef DEBUG
4322 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4323 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4324#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004325 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004326 return 1;
4327}
4328
4329#ifdef CONFIG_DRBD_FAULT_INJECTION
4330/* Fault insertion support including random number generator shamelessly
4331 * stolen from kernel/rcutorture.c */
4332struct fault_random_state {
4333 unsigned long state;
4334 unsigned long count;
4335};
4336
4337#define FAULT_RANDOM_MULT 39916801 /* prime */
4338#define FAULT_RANDOM_ADD 479001701 /* prime */
4339#define FAULT_RANDOM_REFRESH 10000
4340
4341/*
4342 * Crude but fast random-number generator. Uses a linear congruential
4343 * generator, with occasional help from get_random_bytes().
4344 */
4345static unsigned long
4346_drbd_fault_random(struct fault_random_state *rsp)
4347{
4348 long refresh;
4349
Roel Kluin49829ea2009-12-15 22:55:44 +01004350 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004351 get_random_bytes(&refresh, sizeof(refresh));
4352 rsp->state += refresh;
4353 rsp->count = FAULT_RANDOM_REFRESH;
4354 }
4355 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4356 return swahw32(rsp->state);
4357}
4358
4359static char *
4360_drbd_fault_str(unsigned int type) {
4361 static char *_faults[] = {
4362 [DRBD_FAULT_MD_WR] = "Meta-data write",
4363 [DRBD_FAULT_MD_RD] = "Meta-data read",
4364 [DRBD_FAULT_RS_WR] = "Resync write",
4365 [DRBD_FAULT_RS_RD] = "Resync read",
4366 [DRBD_FAULT_DT_WR] = "Data write",
4367 [DRBD_FAULT_DT_RD] = "Data read",
4368 [DRBD_FAULT_DT_RA] = "Data read ahead",
4369 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004370 [DRBD_FAULT_AL_EE] = "EE allocation",
4371 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004372 };
4373
4374 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4375}
4376
4377unsigned int
4378_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4379{
4380 static struct fault_random_state rrs = {0, 0};
4381
4382 unsigned int ret = (
4383 (fault_devs == 0 ||
4384 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4385 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4386
4387 if (ret) {
4388 fault_count++;
4389
Lars Ellenberg73835062010-05-27 11:51:56 +02004390 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004391 dev_warn(DEV, "***Simulating %s failure\n",
4392 _drbd_fault_str(type));
4393 }
4394
4395 return ret;
4396}
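
/*
 * For reference: fault_rate is a percentage and fault_devs a bitmask of
 * minor numbers (0 selects all devices).  E.g. with fault_rate=5 and
 * fault_devs=1, roughly every 20th eligible IO on minor 0 fails, while
 * all other minors are left alone.
 */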
4397#endif
4398
4399const char *drbd_buildtag(void)
4400{
 4401	/* DRBD built from external sources has a reference here to the
4402 git hash of the source code. */
4403
4404 static char buildtag[38] = "\0uilt-in";
4405
4406 if (buildtag[0] == 0) {
Cong Wangbc4854b2012-04-03 14:13:36 +08004407#ifdef MODULE
4408 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4409#else
4410 buildtag[0] = 'b';
Philipp Reisnerb411b362009-09-25 16:07:19 -07004411#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004412 }
4413
4414 return buildtag;
4415}
4416
4417module_init(drbd_init)
4418module_exit(drbd_cleanup)
4419
Philipp Reisnerb411b362009-09-25 16:07:19 -07004420EXPORT_SYMBOL(drbd_conn_str);
4421EXPORT_SYMBOL(drbd_role_str);
4422EXPORT_SYMBOL(drbd_disk_str);
4423EXPORT_SYMBOL(drbd_set_st_err_str);