/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameters, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;	/* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
 * attached.
 */
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
	INIT_LIST_HEAD(&mdev->barrier_acked_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

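/*
 * Rough picture of the transfer log after tl_init() and a couple of
 * _tl_add_barrier() calls (illustration only, not taken from a live run):
 *
 *   mdev->oldest_tle -> [epoch 4711] -> [epoch 4712] -> ... -> [epoch N] <- mdev->newest_tle
 *                          |                |                     |
 *                          requests         requests              requests (circular lists)
 *
 * New requests get attached to the newest epoch (see drbd_req.c), while
 * tl_release() below retires the oldest epoch once the peer has acknowledged
 * the corresponding write barrier.
 */
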
/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match that of the oldest
 * &struct drbd_tl_epoch object, this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violate write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_splice_init(&b->requests, &mdev->barrier_acked_requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}

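/*
 * Note: the dec_ap_pending() in tl_release() balances the inc_ap_pending()
 * done when the barrier was queued for sending (see queue_barrier() in
 * drbd_req.c and the resend path in _tl_restart() below).  A barrier ack
 * whose br_number or n_writes does not match the oldest epoch is treated
 * as a protocol error and tears down the connection.
 */
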

/**
 * _tl_restart() - Walks the transfer log and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io, abort_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (what == abort_disk_io) {
			/* Only walk the TL, leave barrier objects in place */
			b = tmp;
			continue;
		}

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}

	/* Actions operating on the disk state also want to affect requests
	   that were already barrier-acked. */
	switch (what) {
	case abort_disk_io:
	case fail_frozen_disk_io:
	case restart_frozen_disk_io:
		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			_req_mod(req, what);
		}
		/* fall through */
	case connection_lost_while_pending:
	case resend:
		break;
	default:
		dev_err(DEV, "what = %d in _tl_restart()\n", what);
	}
}

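/*
 * All call sites of _tl_restart() in this file hold the req_lock: tl_clear()
 * uses it with connection_lost_while_pending, the tl_restart() wrapper below
 * takes the lock itself, and after_state_ch() uses it with resend or
 * restart_frozen_disk_io once suspended IO is resumed.
 */
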

/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

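/*
 * Example: promoting to R_PRIMARY or starting an online verify while the
 * peer is connected matches cl_wide_st_chg(), so drbd_req_state() below will
 * first ask the peer via drbd_send_state_req() and only commit the change
 * once _req_st_cond() reports the peer's verdict.
 */
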
enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
enum sanitize_state_warnings {
	NO_WARNING,
	ABORTED_ONLINE_VERIFY,
	ABORTED_RESYNC,
	CONNECTION_LOST_NEGOTIATING,
	IMPLICITLY_UPGRADED_DISK,
	IMPLICITLY_UPGRADED_PDSK,
};
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* continue waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

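/*
 * Note: _req_st_cond() is used as the condition of the wait_event() in
 * drbd_req_state() below.  Returning SS_UNKNOWN_ERROR keeps the caller
 * sleeping until CL_ST_CHG_SUCCESS or CL_ST_CHG_FAIL gets set once the
 * peer's reply arrives.
 */
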
/**
 * drbd_req_state() - Perform a possibly cluster-wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

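/*
 * drbd_req_state() may block: it can take state_mutex (CS_SERIALIZE), wait
 * for the peer's answer to a cluster-wide change, and wait for
 * after_state_ch() to finish (CS_WAIT_COMPLETE).  Callers that cannot sleep
 * have to use drbd_change_state() or drbd_force_state() instead.
 */
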
/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
		  union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}

/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
		rv = SS_CONNECTED_OUTDATES;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

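/*
 * is_valid_state() looks only at the target state, while
 * is_valid_state_transition() judges the edge from os to ns;
 * __drbd_set_state() and _req_st_cond() run both checks before committing
 * a change.
 */
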
static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
{
	static const char *msg_table[] = {
		[NO_WARNING] = "",
		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
		[ABORTED_RESYNC] = "Resync aborted.",
		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
	};

	if (warn != NO_WARNING)
		dev_warn(DEV, "%s\n", msg_table[warn]);
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn:	placeholder for a &enum sanitize_state_warnings to report, may be NULL.
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	if (warn)
		*warn = NO_WARNING;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Do not allow network errors to (re)configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn)
			*warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			if (warn)
				*warn = CONNECTION_LOST_NEGOTIATING;
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection state on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary */
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_DISK;
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_PDSK;
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no accessible data is available */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

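/*
 * Example derived from the rules above: if the connection is lost while the
 * disk is still D_NEGOTIATING and the exposed data UUID does not match our
 * current UUID, sanitize_state() forces disk = D_DISKLESS and
 * pdsk = D_UNKNOWN and reports CONNECTION_LOST_NEGOTIATING.
 */
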
/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	enum sanitize_state_warnings ssw;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &ssw);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/* pre-state-change checks; only look at ns */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	print_sanitize_warnings(mdev, ssw);

	{
		char *pbp, pb[300];
		pbp = pb;
		*pbp = 0;
		if (ns.role != os.role)
			pbp += sprintf(pbp, "role( %s -> %s ) ",
				       drbd_role_str(os.role),
				       drbd_role_str(ns.role));
		if (ns.peer != os.peer)
			pbp += sprintf(pbp, "peer( %s -> %s ) ",
				       drbd_role_str(os.peer),
				       drbd_role_str(ns.peer));
		if (ns.conn != os.conn)
			pbp += sprintf(pbp, "conn( %s -> %s ) ",
				       drbd_conn_str(os.conn),
				       drbd_conn_str(ns.conn));
		if (ns.disk != os.disk)
			pbp += sprintf(pbp, "disk( %s -> %s ) ",
				       drbd_disk_str(os.disk),
				       drbd_disk_str(ns.disk));
		if (ns.pdsk != os.pdsk)
			pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
				       drbd_disk_str(os.pdsk),
				       drbd_disk_str(ns.pdsk));
		if (is_susp(ns) != is_susp(os))
			pbp += sprintf(pbp, "susp( %d -> %d ) ",
				       is_susp(os),
				       is_susp(ns));
		if (ns.aftr_isp != os.aftr_isp)
			pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
				       os.aftr_isp,
				       ns.aftr_isp);
		if (ns.peer_isp != os.peer_isp)
			pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
				       os.peer_isp,
				       ns.peer_isp);
		if (ns.user_isp != os.user_isp)
			pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
				       os.user_isp,
				       ns.user_isp);
		dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch work runs, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

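/*
 * __drbd_set_state() runs under req_lock, so everything that may sleep is
 * deferred: the ascw work item queued above is processed by the worker as
 * w_after_state_ch(), which in turn calls after_state_ch().
 */
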
1319static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1320{
1321 struct after_state_chg_work *ascw =
1322 container_of(w, struct after_state_chg_work, w);
1323 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1324 if (ascw->flags & CS_WAIT_COMPLETE) {
1325 D_ASSERT(ascw->done != NULL);
1326 complete(ascw->done);
1327 }
1328 kfree(ascw);
1329
1330 return 1;
1331}
1332
1333static void abw_start_sync(struct drbd_conf *mdev, int rv)
1334{
1335 if (rv) {
1336 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1337 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1338 return;
1339 }
1340
1341 switch (mdev->state.conn) {
1342 case C_STARTING_SYNC_T:
1343 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1344 break;
1345 case C_STARTING_SYNC_S:
1346 drbd_start_resync(mdev, C_SYNC_SOURCE);
1347 break;
1348 }
1349}
1350
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001351int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1352 int (*io_fn)(struct drbd_conf *),
1353 char *why, enum bm_flag flags)
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001354{
1355 int rv;
1356
1357 D_ASSERT(current == mdev->worker.task);
1358
1359 /* open coded non-blocking drbd_suspend_io(mdev); */
1360 set_bit(SUSPEND_IO, &mdev->flags);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001361
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001362 drbd_bm_lock(mdev, why, flags);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001363 rv = io_fn(mdev);
1364 drbd_bm_unlock(mdev);
1365
1366 drbd_resume_io(mdev);
1367
1368 return rv;
1369}
1370
Philipp Reisnerb411b362009-09-25 16:07:19 -07001371/**
1372 * after_state_ch() - Perform after state change actions that may sleep
1373 * @mdev: DRBD device.
1374 * @os: old state.
1375 * @ns: new state.
1376 * @flags: Flags
1377 */
1378static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1379 union drbd_state ns, enum chg_state_flags flags)
1380{
1381 enum drbd_fencing_p fp;
Philipp Reisner67098932010-06-24 16:24:25 +02001382 enum drbd_req_event what = nothing;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001383 union drbd_state nsm = (union drbd_state){ .i = -1 };
Philipp Reisnerb411b362009-09-25 16:07:19 -07001384
1385 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1386 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1387 if (mdev->p_uuid)
1388 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1389 }
1390
1391 fp = FP_DONT_CARE;
1392 if (get_ldev(mdev)) {
1393 fp = mdev->ldev->dc.fencing;
1394 put_ldev(mdev);
1395 }
1396
1397 /* Inform userspace about the change... */
1398 drbd_bcast_state(mdev, ns);
1399
1400 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1401 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1402 drbd_khelper(mdev, "pri-on-incon-degr");
1403
1404 /* Here we have the actions that are performed after a
1405 state change. This function might sleep */
1406
Philipp Reisnerdfa8bed2011-06-29 14:06:08 +02001407 if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
1408 mod_timer(&mdev->request_timer, jiffies + HZ);
1409
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001410 nsm.i = -1;
1411 if (ns.susp_nod) {
Philipp Reisner3f986882010-12-20 14:48:20 +01001412 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1413 what = resend;
Philipp Reisner265be2d2010-05-31 10:14:17 +02001414
Philipp Reisner79f16f52011-07-15 18:44:26 +02001415 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1416 ns.disk > D_NEGOTIATING)
Philipp Reisner3f986882010-12-20 14:48:20 +01001417 what = restart_frozen_disk_io;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001418
Philipp Reisner3f986882010-12-20 14:48:20 +01001419 if (what != nothing)
1420 nsm.susp_nod = 0;
Philipp Reisner265be2d2010-05-31 10:14:17 +02001421 }
1422
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001423 if (ns.susp_fen) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001424 /* case1: The outdate peer handler is successful: */
1425 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001426 tl_clear(mdev);
Philipp Reisner43a51822010-06-11 11:26:34 +02001427 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1428 drbd_uuid_new_current(mdev);
1429 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner43a51822010-06-11 11:26:34 +02001430 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001431 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001432 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001433 spin_unlock_irq(&mdev->req_lock);
1434 }
Philipp Reisner43a51822010-06-11 11:26:34 +02001435 /* case2: The connection was established again: */
1436 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1437 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner67098932010-06-24 16:24:25 +02001438 what = resend;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001439 nsm.susp_fen = 0;
Philipp Reisner43a51822010-06-11 11:26:34 +02001440 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001441 }
Philipp Reisner67098932010-06-24 16:24:25 +02001442
1443 if (what != nothing) {
1444 spin_lock_irq(&mdev->req_lock);
1445 _tl_restart(mdev, what);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001446 nsm.i &= mdev->state.i;
1447 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
Philipp Reisner67098932010-06-24 16:24:25 +02001448 spin_unlock_irq(&mdev->req_lock);
1449 }
1450
Lars Ellenberg5a22db82010-12-17 21:14:23 +01001451 /* Became sync source. With protocol >= 96, we still need to send out
1452 * the sync uuid now. Need to do that before any drbd_send_state, or
1453 * the other side may go "paused sync" before receiving the sync uuids,
1454 * which is unexpected. */
1455 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1456 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1457 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1458 drbd_gen_and_send_sync_uuid(mdev);
1459 put_ldev(mdev);
1460 }
1461
Philipp Reisnerb411b362009-09-25 16:07:19 -07001462 /* Do not change the order of the if above and the two below... */
1463 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1464 drbd_send_uuids(mdev);
1465 drbd_send_state(mdev);
1466 }
Lars Ellenberg54b956a2011-01-20 10:47:53 +01001467 /* No point in queuing send_bitmap if we don't have a connection
1468 * anymore, so check also the _current_ state, not only the new state
1469 * at the time this work was queued. */
1470 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1471 mdev->state.conn == C_WF_BITMAP_S)
1472 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001473 "send_bitmap (WFBitMapS)",
1474 BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001475
1476 /* Lost contact to peer's copy of the data */
1477 if ((os.pdsk >= D_INCONSISTENT &&
1478 os.pdsk != D_UNKNOWN &&
1479 os.pdsk != D_OUTDATED)
1480 && (ns.pdsk < D_INCONSISTENT ||
1481 ns.pdsk == D_UNKNOWN ||
1482 ns.pdsk == D_OUTDATED)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001483 if (get_ldev(mdev)) {
1484 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001485 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001486 if (is_susp(mdev->state)) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001487 set_bit(NEW_CUR_UUID, &mdev->flags);
1488 } else {
1489 drbd_uuid_new_current(mdev);
1490 drbd_send_uuids(mdev);
1491 }
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001492 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001493 put_ldev(mdev);
1494 }
1495 }
1496
1497 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
Philipp Reisnerbca482e2011-07-15 12:14:27 +02001498 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1499 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001500 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001501 drbd_send_uuids(mdev);
1502 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001503 /* D_DISKLESS Peer becomes secondary */
1504 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001505 /* We may still be Primary ourselves.
1506 * No harm done if the bitmap still changes,
1507 * redirtied pages will follow later. */
1508 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1509 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001510 put_ldev(mdev);
1511 }
1512
Lars Ellenberg06d33e92010-12-18 17:00:59 +01001513 /* Write out all changed bits on demote.
1514 * Though, no need to do that just yet
1515 * if there is a resync going on still */
1516 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1517 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001518 /* No changes to the bitmap expected this time, so assert that,
1519 * even though no harm was done if it did change. */
1520 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1521 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001522 put_ldev(mdev);
1523 }
1524
1525 /* Last part of the attaching process ... */
1526 if (ns.conn >= C_CONNECTED &&
1527 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001528 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001529 drbd_send_uuids(mdev);
1530 drbd_send_state(mdev);
1531 }
1532
1533 /* We want to pause/continue resync, tell peer. */
1534 if (ns.conn >= C_CONNECTED &&
1535 ((os.aftr_isp != ns.aftr_isp) ||
1536 (os.user_isp != ns.user_isp)))
1537 drbd_send_state(mdev);
1538
1539 /* In case one of the isp bits got set, suspend other devices. */
1540 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1541 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1542 suspend_other_sg(mdev);
1543
1544 /* Make sure the peer gets informed about possible state
1545 changes (ISP bits) while we were in WFReportParams. */
1546 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1547 drbd_send_state(mdev);
1548
Philipp Reisner67531712010-10-27 12:21:30 +02001549 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1550 drbd_send_state(mdev);
1551
Philipp Reisnerb411b362009-09-25 16:07:19 -07001552 /* We are in the progress to start a full sync... */
1553 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1554 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001555 /* no other bitmap changes expected during this phase */
1556 drbd_queue_bitmap_io(mdev,
1557 &drbd_bmio_set_n_write, &abw_start_sync,
1558 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001559
1560 /* We are invalidating our self... */
1561 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1562 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001563 /* other bitmap operation expected during this phase */
1564 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1565 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001566
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001567 /* first half of local IO error, failure to attach,
1568 * or administrative detach */
1569 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1570 enum drbd_io_error_p eh;
1571 int was_io_error;
1572 /* corresponding get_ldev was in __drbd_set_state, to serialize
1573 * our cleanup here with the transition to D_DISKLESS,
1574 * so it is safe to dereference ldev here. */
1575 eh = mdev->ldev->dc.on_io_error;
1576 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1577
Philipp Reisner2b4dd362011-03-14 13:01:50 +01001578 /* Immediately allow completion of all application IO that waits
1579 for completion from the local disk. */
1580 tl_restart(mdev, abort_disk_io);
1581
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001582 /* current state still has to be D_FAILED,
1583 * there is only one way out: to D_DISKLESS,
1584 * and that may only happen after our put_ldev below. */
1585 if (mdev->state.disk != D_FAILED)
1586 dev_err(DEV,
1587 "ASSERT FAILED: disk is %s during detach\n",
1588 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001589
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001590 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001591 dev_info(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001592
1593 drbd_rs_cancel_all(mdev);
1594
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001595 /* In case we want to get something to stable storage still,
1596 * this may be the last chance.
1597 * Following put_ldev may transition to D_DISKLESS. */
1598 drbd_md_sync(mdev);
1599 put_ldev(mdev);
1600
1601 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001602 drbd_khelper(mdev, "local-io-error");
1603 }
1604
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001605 /* second half of local IO error, failure to attach,
1606 * or administrative detach,
1607 * after local_cnt references have reached zero again */
1608 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1609 /* We must still be diskless,
1610 * re-attach has to be serialized with this! */
1611 if (mdev->state.disk != D_DISKLESS)
1612 dev_err(DEV,
1613 "ASSERT FAILED: disk is %s while going diskless\n",
1614 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001615
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001616 mdev->rs_total = 0;
1617 mdev->rs_failed = 0;
1618 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001619
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001620 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001621 dev_info(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001622 /* corresponding get_ldev in __drbd_set_state
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001623 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001624 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001625 }
1626
Philipp Reisner738a84b2011-03-03 00:21:30 +01001627 /* Notify peer that I had a local IO error, and did not detach. */
1628 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1629 drbd_send_state(mdev);
1630
Philipp Reisnerb411b362009-09-25 16:07:19 -07001631 /* Disks got bigger while they were detached */
1632 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1633 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1634 if (ns.conn == C_CONNECTED)
1635 resync_after_online_grow(mdev);
1636 }
1637
1638 /* A resync finished or aborted, wake paused devices... */
1639 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1640 (os.peer_isp && !ns.peer_isp) ||
1641 (os.user_isp && !ns.user_isp))
1642 resume_next_sg(mdev);
1643
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001644 /* sync target done with resync. Explicitly notify peer, even though
1645 * it should (at least for non-empty resyncs) already know itself. */
1646 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1647 drbd_send_state(mdev);
1648
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001649 /* This triggers bitmap writeout of potentially still unwritten pages
1650 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001651 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001652 * For resync aborted because of local disk failure, we cannot do
1653 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001654 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001655 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001656 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1657 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1658 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001659 put_ldev(mdev);
1660 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001661
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001662 /* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001663 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001664 drbd_free_tl_hash(mdev);
1665
Philipp Reisnerb411b362009-09-25 16:07:19 -07001666 /* Upon network connection, we need to start the receiver */
1667 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1668 drbd_thread_start(&mdev->receiver);
1669
1670 /* Terminate worker thread if we are unconfigured - it will be
1671 restarted as needed... */
1672 if (ns.disk == D_DISKLESS &&
1673 ns.conn == C_STANDALONE &&
1674 ns.role == R_SECONDARY) {
1675 if (os.aftr_isp != ns.aftr_isp)
1676 resume_next_sg(mdev);
1677 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1678 if (test_bit(DEVICE_DYING, &mdev->flags))
1679 drbd_thread_stop_nowait(&mdev->worker);
1680 }
1681
1682 drbd_md_sync(mdev);
1683}
1684
1685
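/* Common entry point for all DRBD kernel threads (receiver, worker, asender):
 * runs thi->function, re-enters it when a restart was requested while the
 * thread was exiting, and signals thi->stop on final termination. */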
1686static int drbd_thread_setup(void *arg)
1687{
1688 struct drbd_thread *thi = (struct drbd_thread *) arg;
1689 struct drbd_conf *mdev = thi->mdev;
1690 unsigned long flags;
1691 int retval;
1692
1693restart:
1694 retval = thi->function(thi);
1695
1696 spin_lock_irqsave(&thi->t_lock, flags);
1697
1698 /* if the receiver has been "Exiting", the last thing it did
1699 * was set the conn state to "StandAlone",
1700 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1701 * and receiver thread will be "started".
1702 * drbd_thread_start needs to set "Restarting" in that case.
1703 * t_state check and assignment needs to be within the same spinlock,
1704 * so either thread_start sees Exiting, and can remap to Restarting,
1705 * or thread_start sees None, and can proceed as normal.
1706 */
1707
1708 if (thi->t_state == Restarting) {
1709 dev_info(DEV, "Restarting %s\n", current->comm);
1710 thi->t_state = Running;
1711 spin_unlock_irqrestore(&thi->t_lock, flags);
1712 goto restart;
1713 }
1714
1715 thi->task = NULL;
1716 thi->t_state = None;
1717 smp_mb();
1718 complete(&thi->stop);
1719 spin_unlock_irqrestore(&thi->t_lock, flags);
1720
1721 dev_info(DEV, "Terminating %s\n", current->comm);
1722
1723 /* Release mod reference taken when thread was started */
1724 module_put(THIS_MODULE);
1725 return retval;
1726}
1727
1728static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1729 int (*func) (struct drbd_thread *))
1730{
1731 spin_lock_init(&thi->t_lock);
1732 thi->task = NULL;
1733 thi->t_state = None;
1734 thi->function = func;
1735 thi->mdev = mdev;
1736}
1737
1738int drbd_thread_start(struct drbd_thread *thi)
1739{
1740 struct drbd_conf *mdev = thi->mdev;
1741 struct task_struct *nt;
1742 unsigned long flags;
1743
1744 const char *me =
1745 thi == &mdev->receiver ? "receiver" :
1746 thi == &mdev->asender ? "asender" :
1747 thi == &mdev->worker ? "worker" : "NONSENSE";
1748
1749 /* is used from state engine doing drbd_thread_stop_nowait,
1750 * while holding the req lock irqsave */
1751 spin_lock_irqsave(&thi->t_lock, flags);
1752
1753 switch (thi->t_state) {
1754 case None:
1755 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1756 me, current->comm, current->pid);
1757
1758 /* Get ref on module for thread - this is released when thread exits */
1759 if (!try_module_get(THIS_MODULE)) {
1760 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1761 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001762 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001763 }
1764
1765 init_completion(&thi->stop);
1766 D_ASSERT(thi->task == NULL);
1767 thi->reset_cpu_mask = 1;
1768 thi->t_state = Running;
1769 spin_unlock_irqrestore(&thi->t_lock, flags);
1770 flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1771
1772 nt = kthread_create(drbd_thread_setup, (void *) thi,
1773 "drbd%d_%s", mdev_to_minor(mdev), me);
1774
1775 if (IS_ERR(nt)) {
1776 dev_err(DEV, "Couldn't start thread\n");
1777
1778 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001779 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001780 }
1781 spin_lock_irqsave(&thi->t_lock, flags);
1782 thi->task = nt;
1783 thi->t_state = Running;
1784 spin_unlock_irqrestore(&thi->t_lock, flags);
1785 wake_up_process(nt);
1786 break;
1787 case Exiting:
1788 thi->t_state = Restarting;
1789 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1790 me, current->comm, current->pid);
1791 /* fall through */
1792 case Running:
1793 case Restarting:
1794 default:
1795 spin_unlock_irqrestore(&thi->t_lock, flags);
1796 break;
1797 }
1798
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001799 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001800}
1801
1802
1803void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1804{
1805 unsigned long flags;
1806
1807 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1808
1809 /* may be called from state engine, holding the req lock irqsave */
1810 spin_lock_irqsave(&thi->t_lock, flags);
1811
1812 if (thi->t_state == None) {
1813 spin_unlock_irqrestore(&thi->t_lock, flags);
1814 if (restart)
1815 drbd_thread_start(thi);
1816 return;
1817 }
1818
1819 if (thi->t_state != ns) {
1820 if (thi->task == NULL) {
1821 spin_unlock_irqrestore(&thi->t_lock, flags);
1822 return;
1823 }
1824
1825 thi->t_state = ns;
1826 smp_mb();
1827 init_completion(&thi->stop);
1828 if (thi->task != current)
1829 force_sig(DRBD_SIGKILL, thi->task);
1830
1831 }
1832
1833 spin_unlock_irqrestore(&thi->t_lock, flags);
1834
1835 if (wait)
1836 wait_for_completion(&thi->stop);
1837}
1838
1839#ifdef CONFIG_SMP
1840/**
1841 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1842 * @mdev: DRBD device.
1843 *
1844 * Forces all threads of a device onto the same CPU. This is beneficial for
1845 * DRBD's performance. May be overwritten by user's configuration.
1846 */
1847void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1848{
1849 int ord, cpu;
1850
1851 /* user override. */
1852 if (cpumask_weight(mdev->cpu_mask))
1853 return;
1854
1855 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1856 for_each_online_cpu(cpu) {
1857 if (ord-- == 0) {
1858 cpumask_set_cpu(cpu, mdev->cpu_mask);
1859 return;
1860 }
1861 }
1862 /* should not be reached */
1863 cpumask_setall(mdev->cpu_mask);
1864}
1865
1866/**
1867 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1868 * @mdev: DRBD device.
1869 *
1870 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1871 * prematurely.
1872 */
1873void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1874{
1875 struct task_struct *p = current;
1876 struct drbd_thread *thi =
1877 p == mdev->asender.task ? &mdev->asender :
1878 p == mdev->receiver.task ? &mdev->receiver :
1879 p == mdev->worker.task ? &mdev->worker :
1880 NULL;
1881 ERR_IF(thi == NULL)
1882 return;
1883 if (!thi->reset_cpu_mask)
1884 return;
1885 thi->reset_cpu_mask = 0;
1886 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1887}
1888#endif
1889
1890/* the appropriate socket mutex must be held already */
1891int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001892 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001893 size_t size, unsigned msg_flags)
1894{
1895 int sent, ok;
1896
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001897 ERR_IF(!h) return false;
1898 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001899
1900 h->magic = BE_DRBD_MAGIC;
1901 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001902 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001903
Philipp Reisnerb411b362009-09-25 16:07:19 -07001904 sent = drbd_send(mdev, sock, h, size, msg_flags);
1905
1906 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001907 if (!ok && !signal_pending(current))
1908 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001909 cmdname(cmd), (int)size, sent);
1910 return ok;
1911}
1912
1913/* don't pass the socket. we may only look at it
1914 * when we hold the appropriate socket mutex.
1915 */
1916int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001917 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001918{
1919 int ok = 0;
1920 struct socket *sock;
1921
1922 if (use_data_socket) {
1923 mutex_lock(&mdev->data.mutex);
1924 sock = mdev->data.socket;
1925 } else {
1926 mutex_lock(&mdev->meta.mutex);
1927 sock = mdev->meta.socket;
1928 }
1929
1930 /* drbd_disconnect() could have called drbd_free_sock()
1931 * while we were waiting in down()... */
1932 if (likely(sock != NULL))
1933 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1934
1935 if (use_data_socket)
1936 mutex_unlock(&mdev->data.mutex);
1937 else
1938 mutex_unlock(&mdev->meta.mutex);
1939 return ok;
1940}
1941
1942int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1943 size_t size)
1944{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001945 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001946 int ok;
1947
1948 h.magic = BE_DRBD_MAGIC;
1949 h.command = cpu_to_be16(cmd);
1950 h.length = cpu_to_be16(size);
1951
1952 if (!drbd_get_data_sock(mdev))
1953 return 0;
1954
Philipp Reisnerb411b362009-09-25 16:07:19 -07001955 ok = (sizeof(h) ==
1956 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1957 ok = ok && (size ==
1958 drbd_send(mdev, mdev->data.socket, data, size, 0));
1959
1960 drbd_put_data_sock(mdev);
1961
1962 return ok;
1963}
1964
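/* Send our syncer configuration (resync rate, dynamic resync controller
 * settings, verify/csums algorithm names) to the peer, using the packet
 * layout that matches the agreed protocol version. */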
1965int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1966{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001967 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001968 struct socket *sock;
1969 int size, rv;
1970 const int apv = mdev->agreed_pro_version;
1971
1972 size = apv <= 87 ? sizeof(struct p_rs_param)
1973 : apv == 88 ? sizeof(struct p_rs_param)
1974 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001975 : apv <= 94 ? sizeof(struct p_rs_param_89)
1976 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001977
1978 /* used from admin command context and receiver/worker context.
1979 * to avoid kmalloc, grab the socket right here,
1980 * then use the pre-allocated sbuf there */
1981 mutex_lock(&mdev->data.mutex);
1982 sock = mdev->data.socket;
1983
1984 if (likely(sock != NULL)) {
1985 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1986
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001987 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001988
1989 /* initialize verify_alg and csums_alg */
1990 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1991
1992 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001993 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1994 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1995 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1996 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001997
1998 if (apv >= 88)
1999 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2000 if (apv >= 89)
2001 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2002
2003 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2004 } else
2005 rv = 0; /* not ok */
2006
2007 mutex_unlock(&mdev->data.mutex);
2008
2009 return rv;
2010}
2011
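/* Send our wire protocol, after-split-brain policies, two-primaries setting
 * and integrity algorithm to the peer; fails with -1 if --dry-run was
 * requested but the peer is too old to understand it. */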
2012int drbd_send_protocol(struct drbd_conf *mdev)
2013{
2014 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002015 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002016
2017 size = sizeof(struct p_protocol);
2018
2019 if (mdev->agreed_pro_version >= 87)
2020 size += strlen(mdev->net_conf->integrity_alg) + 1;
2021
2022 /* we must not recurse into our own queue,
2023 * as that is blocked during handshake */
2024 p = kmalloc(size, GFP_NOIO);
2025 if (p == NULL)
2026 return 0;
2027
2028 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2029 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2030 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2031 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002032 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2033
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002034 cf = 0;
2035 if (mdev->net_conf->want_lose)
2036 cf |= CF_WANT_LOSE;
2037 if (mdev->net_conf->dry_run) {
2038 if (mdev->agreed_pro_version >= 92)
2039 cf |= CF_DRY_RUN;
2040 else {
2041 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02002042 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002043 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002044 }
2045 }
2046 p->conn_flags = cpu_to_be32(cf);
2047
Philipp Reisnerb411b362009-09-25 16:07:19 -07002048 if (mdev->agreed_pro_version >= 87)
2049 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2050
2051 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002052 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002053 kfree(p);
2054 return rv;
2055}
2056
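/* Send our current UUID set, the number of bits set in the bitmap, and the
 * uuid flag word to the peer. */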
2057int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2058{
2059 struct p_uuids p;
2060 int i;
2061
2062 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2063 return 1;
2064
2065 for (i = UI_CURRENT; i < UI_SIZE; i++)
2066 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2067
2068 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2069 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2070 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2071 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2072 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2073 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2074
2075 put_ldev(mdev);
2076
2077 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002078 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002079}
2080
2081int drbd_send_uuids(struct drbd_conf *mdev)
2082{
2083 return _drbd_send_uuids(mdev, 0);
2084}
2085
2086int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2087{
2088 return _drbd_send_uuids(mdev, 8);
2089}
2090
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002091void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2092{
2093 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2094 u64 *uuid = mdev->ldev->md.uuid;
2095 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2096 text,
2097 (unsigned long long)uuid[UI_CURRENT],
2098 (unsigned long long)uuid[UI_BITMAP],
2099 (unsigned long long)uuid[UI_HISTORY_START],
2100 (unsigned long long)uuid[UI_HISTORY_END]);
2101 put_ldev(mdev);
2102 } else {
2103 dev_info(DEV, "%s effective data uuid: %016llX\n",
2104 text,
2105 (unsigned long long)mdev->ed_uuid);
2106 }
2107}
2108
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002109int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002110{
2111 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002112 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002113
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002114 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2115
Philipp Reisner4a23f262011-01-11 17:42:17 +01002116 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002117 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002118 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002119 drbd_md_sync(mdev);
2120 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002121
2122 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002123 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002124}
2125
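/* Announce backing device capacity, user-configured size, currently exposed
 * capacity and maximum bio size to the peer (P_SIZES). */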
Philipp Reisnere89b5912010-03-24 17:11:33 +01002126int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002127{
2128 struct p_sizes p;
2129 sector_t d_size, u_size;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002130 int q_order_type, max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002131 int ok;
2132
2133 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2134 D_ASSERT(mdev->ldev->backing_bdev);
2135 d_size = drbd_get_max_capacity(mdev->ldev);
2136 u_size = mdev->ldev->dc.disk_size;
2137 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002138 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2139 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002140 put_ldev(mdev);
2141 } else {
2142 d_size = 0;
2143 u_size = 0;
2144 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002145 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002146 }
2147
Philipp Reisner68093842011-06-30 15:43:06 +02002148 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2149 if (mdev->agreed_pro_version <= 94)
2150 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2151
Philipp Reisnerb411b362009-09-25 16:07:19 -07002152 p.d_size = cpu_to_be64(d_size);
2153 p.u_size = cpu_to_be64(u_size);
2154 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002155 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002156 p.queue_order_type = cpu_to_be16(q_order_type);
2157 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002158
2159 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002160 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002161 return ok;
2162}
2163
2164/**
2165 * drbd_send_state() - Sends the drbd state to the peer
2166 * @mdev: DRBD device.
2167 */
2168int drbd_send_state(struct drbd_conf *mdev)
2169{
2170 struct socket *sock;
2171 struct p_state p;
2172 int ok = 0;
2173
2174 /* Grab state lock so we won't send state if we're in the middle
2175 * of a cluster wide state change on another thread */
2176 drbd_state_lock(mdev);
2177
2178 mutex_lock(&mdev->data.mutex);
2179
2180 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2181 sock = mdev->data.socket;
2182
2183 if (likely(sock != NULL)) {
2184 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002185 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002186 }
2187
2188 mutex_unlock(&mdev->data.mutex);
2189
2190 drbd_state_unlock(mdev);
2191 return ok;
2192}
2193
2194int drbd_send_state_req(struct drbd_conf *mdev,
2195 union drbd_state mask, union drbd_state val)
2196{
2197 struct p_req_state p;
2198
2199 p.mask = cpu_to_be32(mask.i);
2200 p.val = cpu_to_be32(val.i);
2201
2202 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002203 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002204}
2205
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002206int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002207{
2208 struct p_req_state_reply p;
2209
2210 p.retcode = cpu_to_be32(retcode);
2211
2212 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002213 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002214}
2215
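/* RLE+VLI-compress a chunk of the bitmap into p->code.
 * Returns the number of code bytes used, 0 if compression is not possible
 * or not worthwhile, or -1 on an unexpected zero run length. */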
2216int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2217 struct p_compressed_bm *p,
2218 struct bm_xfer_ctx *c)
2219{
2220 struct bitstream bs;
2221 unsigned long plain_bits;
2222 unsigned long tmp;
2223 unsigned long rl;
2224 unsigned len;
2225 unsigned toggle;
2226 int bits;
2227
2228 /* may we use this feature? */
2229 if ((mdev->sync_conf.use_rle == 0) ||
2230 (mdev->agreed_pro_version < 90))
2231 return 0;
2232
2233 if (c->bit_offset >= c->bm_bits)
2234 return 0; /* nothing to do. */
2235
2236 /* use at most this many bytes */
2237 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2238 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2239 /* plain bits covered in this code string */
2240 plain_bits = 0;
2241
2242 /* p->encoding & 0x80 stores whether the first run length is set.
2243 * bit offset is implicit.
2244 * start with toggle == 2 to be able to tell the first iteration */
2245 toggle = 2;
2246
2247 /* see how many plain bits we can stuff into one packet
2248 * using RLE and VLI. */
2249 do {
2250 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2251 : _drbd_bm_find_next(mdev, c->bit_offset);
2252 if (tmp == -1UL)
2253 tmp = c->bm_bits;
2254 rl = tmp - c->bit_offset;
2255
2256 if (toggle == 2) { /* first iteration */
2257 if (rl == 0) {
2258 /* the first checked bit was set,
2259 * store start value, */
2260 DCBP_set_start(p, 1);
2261 /* but skip encoding of zero run length */
2262 toggle = !toggle;
2263 continue;
2264 }
2265 DCBP_set_start(p, 0);
2266 }
2267
2268 /* paranoia: catch zero runlength.
2269 * can only happen if bitmap is modified while we scan it. */
2270 if (rl == 0) {
2271 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2272 "t:%u bo:%lu\n", toggle, c->bit_offset);
2273 return -1;
2274 }
2275
2276 bits = vli_encode_bits(&bs, rl);
2277 if (bits == -ENOBUFS) /* buffer full */
2278 break;
2279 if (bits <= 0) {
2280 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2281 return 0;
2282 }
2283
2284 toggle = !toggle;
2285 plain_bits += rl;
2286 c->bit_offset = tmp;
2287 } while (c->bit_offset < c->bm_bits);
2288
2289 len = bs.cur.b - p->code + !!bs.cur.bit;
2290
2291 if (plain_bits < (len << 3)) {
2292 /* incompressible with this method.
2293 * we need to rewind both word and bit position. */
2294 c->bit_offset -= plain_bits;
2295 bm_xfer_ctx_bit_to_word_offset(c);
2296 c->bit_offset = c->word_offset * BITS_PER_LONG;
2297 return 0;
2298 }
2299
2300 /* RLE + VLI was able to compress it just fine.
2301 * update c->word_offset. */
2302 bm_xfer_ctx_bit_to_word_offset(c);
2303
2304 /* store pad_bits */
2305 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2306
2307 return len;
2308}
2309
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002310/**
2311 * send_bitmap_rle_or_plain
2312 *
2313 * Return 0 when done, 1 when another iteration is needed, and a negative error
2314 * code upon failure.
2315 */
2316static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002317send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002318 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002319{
2320 struct p_compressed_bm *p = (void*)h;
2321 unsigned long num_words;
2322 int len;
2323 int ok;
2324
2325 len = fill_bitmap_rle_bits(mdev, p, c);
2326
2327 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002328 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002329
2330 if (len) {
2331 DCBP_set_code(p, RLE_VLI_Bits);
2332 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2333 sizeof(*p) + len, 0);
2334
2335 c->packets[0]++;
2336 c->bytes[0] += sizeof(*p) + len;
2337
2338 if (c->bit_offset >= c->bm_bits)
2339 len = 0; /* DONE */
2340 } else {
2341 /* was not compressible.
2342 * send a buffer full of plain text bits instead. */
2343 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2344 len = num_words * sizeof(long);
2345 if (len)
2346 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2347 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002348 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002349 c->word_offset += num_words;
2350 c->bit_offset = c->word_offset * BITS_PER_LONG;
2351
2352 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002353 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002354
2355 if (c->bit_offset > c->bm_bits)
2356 c->bit_offset = c->bm_bits;
2357 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002358 if (ok) {
2359 if (len == 0) {
2360 INFO_bm_xfer_stats(mdev, "send", c);
2361 return 0;
2362 } else
2363 return 1;
2364 }
2365 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002366}
2367
2368/* See the comment at receive_bitmap() */
2369int _drbd_send_bitmap(struct drbd_conf *mdev)
2370{
2371 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002372 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002373 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002374
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002375 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002376
2377 /* maybe we should use some per thread scratch page,
2378 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002379 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002380 if (!p) {
2381 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002382 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002383 }
2384
2385 if (get_ldev(mdev)) {
2386 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2387 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2388 drbd_bm_set_all(mdev);
2389 if (drbd_bm_write(mdev)) {
2390 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2391 * but otherwise process as per normal - need to tell other
2392 * side that a full resync is required! */
2393 dev_err(DEV, "Failed to write bitmap to disk!\n");
2394 } else {
2395 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2396 drbd_md_sync(mdev);
2397 }
2398 }
2399 put_ldev(mdev);
2400 }
2401
2402 c = (struct bm_xfer_ctx) {
2403 .bm_bits = drbd_bm_bits(mdev),
2404 .bm_words = drbd_bm_words(mdev),
2405 };
2406
2407 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002408 err = send_bitmap_rle_or_plain(mdev, p, &c);
2409 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002410
2411 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002412 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002413}
2414
2415int drbd_send_bitmap(struct drbd_conf *mdev)
2416{
2417 int err;
2418
2419 if (!drbd_get_data_sock(mdev))
2420 return -1;
2421 err = !_drbd_send_bitmap(mdev);
2422 drbd_put_data_sock(mdev);
2423 return err;
2424}
2425
2426int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2427{
2428 int ok;
2429 struct p_barrier_ack p;
2430
2431 p.barrier = barrier_nr;
2432 p.set_size = cpu_to_be32(set_size);
2433
2434 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002435 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002436 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002437 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002438 return ok;
2439}
2440
2441/**
2442 * _drbd_send_ack() - Sends an ack packet
2443 * @mdev: DRBD device.
2444 * @cmd: Packet command code.
2445 * @sector: sector, needs to be in big endian byte order
2446 * @blksize: size in byte, needs to be in big endian byte order
2447 * @block_id: Id, big endian byte order
2448 */
2449static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2450 u64 sector,
2451 u32 blksize,
2452 u64 block_id)
2453{
2454 int ok;
2455 struct p_block_ack p;
2456
2457 p.sector = sector;
2458 p.block_id = block_id;
2459 p.blksize = blksize;
2460 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2461
2462 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002463 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002464 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002465 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002466 return ok;
2467}
2468
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002469/* dp->sector and dp->block_id already/still in network byte order,
2470 * data_size is payload size according to dp->head,
2471 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002472int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002473 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002474{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002475 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2476 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002477 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2478 dp->block_id);
2479}
2480
2481int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2482 struct p_block_req *rp)
2483{
2484 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2485}
2486
2487/**
2488 * drbd_send_ack() - Sends an ack packet
2489 * @mdev: DRBD device.
2490 * @cmd: Packet command code.
2491 * @e: Epoch entry.
2492 */
2493int drbd_send_ack(struct drbd_conf *mdev,
2494 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2495{
2496 return _drbd_send_ack(mdev, cmd,
2497 cpu_to_be64(e->sector),
2498 cpu_to_be32(e->size),
2499 e->block_id);
2500}
2501
2502/* This function misuses the block_id field to signal if the blocks
2503 * are in sync or not. */
2504int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2505 sector_t sector, int blksize, u64 block_id)
2506{
2507 return _drbd_send_ack(mdev, cmd,
2508 cpu_to_be64(sector),
2509 cpu_to_be32(blksize),
2510 cpu_to_be64(block_id));
2511}
2512
2513int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2514 sector_t sector, int size, u64 block_id)
2515{
2516 int ok;
2517 struct p_block_req p;
2518
2519 p.sector = cpu_to_be64(sector);
2520 p.block_id = block_id;
2521 p.blksize = cpu_to_be32(size);
2522
2523 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002524 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002525 return ok;
2526}
2527
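/* Send a block request/reply packet together with a digest of our local
 * copy of that block (used by checksum-based resync and online verify). */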
2528int drbd_send_drequest_csum(struct drbd_conf *mdev,
2529 sector_t sector, int size,
2530 void *digest, int digest_size,
2531 enum drbd_packets cmd)
2532{
2533 int ok;
2534 struct p_block_req p;
2535
2536 p.sector = cpu_to_be64(sector);
2537 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2538 p.blksize = cpu_to_be32(size);
2539
2540 p.head.magic = BE_DRBD_MAGIC;
2541 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002542 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002543
2544 mutex_lock(&mdev->data.mutex);
2545
2546 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2547 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2548
2549 mutex_unlock(&mdev->data.mutex);
2550
2551 return ok;
2552}
2553
2554int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2555{
2556 int ok;
2557 struct p_block_req p;
2558
2559 p.sector = cpu_to_be64(sector);
2560 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2561 p.blksize = cpu_to_be32(size);
2562
2563 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002564 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002565 return ok;
2566}
2567
2568/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002569 * returns false if we should retry,
2570 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002571 */
2572static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2573{
2574 int drop_it;
2575 /* long elapsed = (long)(jiffies - mdev->last_received); */
2576
2577 drop_it = mdev->meta.socket == sock
2578 || !mdev->asender.task
2579 || get_t_state(&mdev->asender) != Running
2580 || mdev->state.conn < C_CONNECTED;
2581
2582 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002583 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002584
2585 drop_it = !--mdev->ko_count;
2586 if (!drop_it) {
2587 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2588 current->comm, current->pid, mdev->ko_count);
2589 request_ping(mdev);
2590 }
2591
2592 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2593}
2594
2595/* The idea of sendpage seems to be to put some kind of reference
2596 * to the page into the skb, and to hand it over to the NIC. In
2597 * this process get_page() gets called.
2598 *
2599 * As soon as the page was really sent over the network put_page()
2600 * gets called by some part of the network layer. [ NIC driver? ]
2601 *
2602 * [ get_page() / put_page() increment/decrement the count. If count
2603 * reaches 0 the page will be freed. ]
2604 *
2605 * This works nicely with pages from FSs.
2606 * But this means that in protocol A we might signal IO completion too early!
2607 *
2608 * In order not to corrupt data during a resync we must make sure
2609 * that we do not reuse our own buffer pages (EEs) too early, therefore
2610 * we have the net_ee list.
2611 *
2612 * XFS seems to have problems, still, it submits pages with page_count == 0!
2613 * As a workaround, we disable sendpage on pages
2614 * with page_count == 0 or PageSlab.
2615 */
2616static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002617 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002618{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002619 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002620 kunmap(page);
2621 if (sent == size)
2622 mdev->send_cnt += size>>9;
2623 return sent == size;
2624}
2625
2626static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002627 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002628{
2629 mm_segment_t oldfs = get_fs();
2630 int sent, ok;
2631 int len = size;
2632
2633 /* e.g. XFS meta- & log-data is in slab pages, which have a
2634 * page_count of 0 and/or have PageSlab() set.
2635 * we cannot use send_page for those, as that does get_page();
2636 * put_page(); and would cause either a VM_BUG directly, or
2637 * __page_cache_release a page that would actually still be referenced
2638 * by someone, leading to some obscure delayed Oops somewhere else. */
2639 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002640 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002641
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002642 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002643 drbd_update_congested(mdev);
2644 set_fs(KERNEL_DS);
2645 do {
2646 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2647 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002648 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002649 if (sent == -EAGAIN) {
2650 if (we_should_drop_the_connection(mdev,
2651 mdev->data.socket))
2652 break;
2653 else
2654 continue;
2655 }
2656 if (sent <= 0) {
2657 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2658 __func__, (int)size, len, sent);
2659 break;
2660 }
2661 len -= sent;
2662 offset += sent;
2663 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2664 set_fs(oldfs);
2665 clear_bit(NET_CONGESTED, &mdev->flags);
2666
2667 ok = (len == 0);
2668 if (likely(ok))
2669 mdev->send_cnt += size>>9;
2670 return ok;
2671}
2672
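/* Send all pages of a bio by copying the data into the socket
 * (kernel_sendmsg, no sendpage). */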
2673static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2674{
2675 struct bio_vec *bvec;
2676 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002677 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002678 __bio_for_each_segment(bvec, bio, i, 0) {
2679 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002680 bvec->bv_offset, bvec->bv_len,
2681 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002682 return 0;
2683 }
2684 return 1;
2685}
2686
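/* Zero-copy variant: hand the bio pages to sendpage where allowed, falling
 * back to a copying send for slab or unreferenced pages. */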
2687static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2688{
2689 struct bio_vec *bvec;
2690 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002691 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002692 __bio_for_each_segment(bvec, bio, i, 0) {
2693 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002694 bvec->bv_offset, bvec->bv_len,
2695 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002696 return 0;
2697 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002698 return 1;
2699}
2700
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002701static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2702{
2703 struct page *page = e->pages;
2704 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002705 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002706 page_chain_for_each(page) {
2707 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002708 if (!_drbd_send_page(mdev, page, 0, l,
2709 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002710 return 0;
2711 len -= l;
2712 }
2713 return 1;
2714}
2715
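/* Translate bio rw flags into the DP_* flags sent on the wire; FUA, FLUSH
 * and DISCARD are only sent to peers speaking protocol 95 or newer. */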
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002716static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2717{
2718 if (mdev->agreed_pro_version >= 95)
2719 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002720 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2721 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2722 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2723 else
Jens Axboe721a9602011-03-09 11:56:30 +01002724 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002725}
2726
Philipp Reisnerb411b362009-09-25 16:07:19 -07002727/* Used to send write requests
2728 * R_PRIMARY -> Peer (P_DATA)
2729 */
2730int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2731{
2732 int ok = 1;
2733 struct p_data p;
2734 unsigned int dp_flags = 0;
2735 void *dgb;
2736 int dgs;
2737
2738 if (!drbd_get_data_sock(mdev))
2739 return 0;
2740
2741 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2742 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2743
Philipp Reisnerd5373382010-08-23 15:18:33 +02002744 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002745 p.head.h80.magic = BE_DRBD_MAGIC;
2746 p.head.h80.command = cpu_to_be16(P_DATA);
2747 p.head.h80.length =
2748 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2749 } else {
2750 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2751 p.head.h95.command = cpu_to_be16(P_DATA);
2752 p.head.h95.length =
2753 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2754 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002755
2756 p.sector = cpu_to_be64(req->sector);
2757 p.block_id = (unsigned long)req;
2758 p.seq_num = cpu_to_be32(req->seq_num =
2759 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002760
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002761 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2762
Philipp Reisnerb411b362009-09-25 16:07:19 -07002763 if (mdev->state.conn >= C_SYNC_SOURCE &&
2764 mdev->state.conn <= C_PAUSED_SYNC_T)
2765 dp_flags |= DP_MAY_SET_IN_SYNC;
2766
2767 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002768 set_bit(UNPLUG_REMOTE, &mdev->flags);
2769 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002770 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002771 if (ok && dgs) {
2772 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002773 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002774 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002775 }
2776 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002777 /* For protocol A, we have to memcpy the payload into
2778 * socket buffers, as we may complete right away
2779 * as soon as we handed it over to tcp, at which point the data
2780 * pages may become invalid.
2781 *
2782 * For data-integrity enabled, we copy it as well, so we can be
2783 * sure that even if the bio pages may still be modified, it
2784 * won't change the data on the wire, thus if the digest checks
2785 * out ok after sending on this side, but does not fit on the
2786 * receiving side, we sure have detected corruption elsewhere.
2787 */
2788 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002789 ok = _drbd_send_bio(mdev, req->master_bio);
2790 else
2791 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002792
2793 /* double check digest, sometimes buffers have been modified in flight. */
2794 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002795 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002796 * currently supported in kernel crypto. */
2797 unsigned char digest[64];
2798 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2799 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2800 dev_warn(DEV,
2801 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2802 (unsigned long long)req->sector, req->size);
2803 }
2804 } /* else if (dgs > 64) {
2805 ... Be noisy about digest too large ...
2806 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002807 }
2808
2809 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002810
Philipp Reisnerb411b362009-09-25 16:07:19 -07002811 return ok;
2812}
2813
2814/* answer packet, used to send data back for read requests:
2815 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2816 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2817 */
2818int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2819 struct drbd_epoch_entry *e)
2820{
2821 int ok;
2822 struct p_data p;
2823 void *dgb;
2824 int dgs;
2825
2826 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2827 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2828
Philipp Reisnerd5373382010-08-23 15:18:33 +02002829 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002830 p.head.h80.magic = BE_DRBD_MAGIC;
2831 p.head.h80.command = cpu_to_be16(cmd);
2832 p.head.h80.length =
2833 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2834 } else {
2835 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2836 p.head.h95.command = cpu_to_be16(cmd);
2837 p.head.h95.length =
2838 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2839 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002840
2841 p.sector = cpu_to_be64(e->sector);
2842 p.block_id = e->block_id;
2843 /* p.seq_num = 0; No sequence numbers here.. */
2844
2845 /* Only called by our kernel thread.
2846 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2847 * in response to admin command or module unload.
2848 */
2849 if (!drbd_get_data_sock(mdev))
2850 return 0;
2851
Philipp Reisner0b70a132010-08-20 13:36:10 +02002852 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002853 if (ok && dgs) {
2854 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002855 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002856 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002857 }
2858 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002859 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002860
2861 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002862
Philipp Reisnerb411b362009-09-25 16:07:19 -07002863 return ok;
2864}
2865
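/* Report the request's sector range as out of sync to the peer
 * (P_OUT_OF_SYNC). */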
Philipp Reisner73a01a12010-10-27 14:33:00 +02002866int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2867{
2868 struct p_block_desc p;
2869
2870 p.sector = cpu_to_be64(req->sector);
2871 p.blksize = cpu_to_be32(req->size);
2872
2873 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2874}
2875
Philipp Reisnerb411b362009-09-25 16:07:19 -07002876/*
2877 drbd_send distinguishes two cases:
2878
2879 Packets sent via the data socket "sock"
2880 and packets sent via the meta data socket "msock"
2881
2882 sock msock
2883 -----------------+-------------------------+------------------------------
2884 timeout conf.timeout / 2 conf.timeout / 2
2885 timeout action send a ping via msock Abort communication
2886 and close all sockets
2887*/
2888
2889/*
2890 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2891 */
2892int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2893 void *buf, size_t size, unsigned msg_flags)
2894{
2895 struct kvec iov;
2896 struct msghdr msg;
2897 int rv, sent = 0;
2898
2899 if (!sock)
2900 return -1000;
2901
2902 /* THINK if (signal_pending) return ... ? */
2903
2904 iov.iov_base = buf;
2905 iov.iov_len = size;
2906
2907 msg.msg_name = NULL;
2908 msg.msg_namelen = 0;
2909 msg.msg_control = NULL;
2910 msg.msg_controllen = 0;
2911 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2912
2913 if (sock == mdev->data.socket) {
2914 mdev->ko_count = mdev->net_conf->ko_count;
2915 drbd_update_congested(mdev);
2916 }
2917 do {
2918 /* STRANGE
2919 * tcp_sendmsg does _not_ use its size parameter at all ?
2920 *
2921 * -EAGAIN on timeout, -EINTR on signal.
2922 */
2923/* THINK
2924 * do we need to block DRBD_SIG if sock == &meta.socket ??
2925 * otherwise wake_asender() might interrupt some send_*Ack !
2926 */
2927 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2928 if (rv == -EAGAIN) {
2929 if (we_should_drop_the_connection(mdev, sock))
2930 break;
2931 else
2932 continue;
2933 }
2934 D_ASSERT(rv != 0);
2935 if (rv == -EINTR) {
2936 flush_signals(current);
2937 rv = 0;
2938 }
2939 if (rv < 0)
2940 break;
2941 sent += rv;
2942 iov.iov_base += rv;
2943 iov.iov_len -= rv;
2944 } while (sent < size);
2945
2946 if (sock == mdev->data.socket)
2947 clear_bit(NET_CONGESTED, &mdev->flags);
2948
2949 if (rv <= 0) {
2950 if (rv != -EAGAIN) {
2951 dev_err(DEV, "%s_sendmsg returned %d\n",
2952 sock == mdev->meta.socket ? "msock" : "sock",
2953 rv);
2954 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2955 } else
2956 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2957 }
2958
2959 return sent;
2960}
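
/*
 * Minimal caller sketch (illustration only; the helper below is not part of
 * DRBD): per the comment above, drbd_send() must be called with the mutex of
 * the corresponding socket held, here mdev->meta.mutex for the meta socket.
 */
#if 0
static int example_send_on_meta_socket(struct drbd_conf *mdev,
				       void *buf, size_t size)
{
	int sent = 0;

	mutex_lock(&mdev->meta.mutex);
	if (mdev->meta.socket)
		sent = drbd_send(mdev, mdev->meta.socket, buf, size, 0);
	mutex_unlock(&mdev->meta.mutex);

	/* drbd_send() returns the number of bytes handed to the socket;
	 * anything short of "size" means the send failed and the connection
	 * state was already forced to C_BROKEN_PIPE or C_TIMEOUT. */
	return sent == (int)size;
}
#endif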
2961
2962static int drbd_open(struct block_device *bdev, fmode_t mode)
2963{
2964 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2965 unsigned long flags;
2966 int rv = 0;
2967
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002968 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002969 spin_lock_irqsave(&mdev->req_lock, flags);
2970 /* to have a stable mdev->state.role
2971 * and no race with updating open_cnt */
2972
2973 if (mdev->state.role != R_PRIMARY) {
2974 if (mode & FMODE_WRITE)
2975 rv = -EROFS;
2976 else if (!allow_oos)
2977 rv = -EMEDIUMTYPE;
2978 }
2979
2980 if (!rv)
2981 mdev->open_cnt++;
2982 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002983 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002984
2985 return rv;
2986}
2987
2988static int drbd_release(struct gendisk *gd, fmode_t mode)
2989{
2990 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002991 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002992 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002993 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002994 return 0;
2995}
2996
Philipp Reisnerb411b362009-09-25 16:07:19 -07002997static void drbd_set_defaults(struct drbd_conf *mdev)
2998{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002999 /* This way we get a compile error when sync_conf grows,
3000	   and we forget to initialize it here */
3001 mdev->sync_conf = (struct syncer_conf) {
3002 /* .rate = */ DRBD_RATE_DEF,
3003 /* .after = */ DRBD_AFTER_DEF,
3004 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02003005 /* .verify_alg = */ {}, 0,
3006 /* .cpu_mask = */ {}, 0,
3007 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02003008 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02003009 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3010 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3011 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3012 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003013 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3014 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02003015 };
3016
3017	/* Have to initialize it this way, because the bitfield layout
3018	   differs between big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003019 mdev->state = (union drbd_state) {
3020 { .role = R_SECONDARY,
3021 .peer = R_UNKNOWN,
3022 .conn = C_STANDALONE,
3023 .disk = D_DISKLESS,
3024 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02003025 .susp = 0,
3026 .susp_nod = 0,
3027 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07003028 } };
3029}
3030
3031void drbd_init_set_defaults(struct drbd_conf *mdev)
3032{
3033 /* the memset(,0,) did most of this.
3034 * note: only assignments, no allocation in here */
3035
3036 drbd_set_defaults(mdev);
3037
Philipp Reisnerb411b362009-09-25 16:07:19 -07003038 atomic_set(&mdev->ap_bio_cnt, 0);
3039 atomic_set(&mdev->ap_pending_cnt, 0);
3040 atomic_set(&mdev->rs_pending_cnt, 0);
3041 atomic_set(&mdev->unacked_cnt, 0);
3042 atomic_set(&mdev->local_cnt, 0);
3043 atomic_set(&mdev->net_cnt, 0);
3044 atomic_set(&mdev->packet_seq, 0);
3045 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003046 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02003047 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003048 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02003049 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnere1711732011-06-27 11:51:46 +02003050 atomic_set(&mdev->md_io_in_use, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003051
Philipp Reisnerb411b362009-09-25 16:07:19 -07003052 mutex_init(&mdev->data.mutex);
3053 mutex_init(&mdev->meta.mutex);
3054 sema_init(&mdev->data.work.s, 0);
3055 sema_init(&mdev->meta.work.s, 0);
3056 mutex_init(&mdev->state_mutex);
3057
3058 spin_lock_init(&mdev->data.work.q_lock);
3059 spin_lock_init(&mdev->meta.work.q_lock);
3060
3061 spin_lock_init(&mdev->al_lock);
3062 spin_lock_init(&mdev->req_lock);
3063 spin_lock_init(&mdev->peer_seq_lock);
3064 spin_lock_init(&mdev->epoch_lock);
3065
3066 INIT_LIST_HEAD(&mdev->active_ee);
3067 INIT_LIST_HEAD(&mdev->sync_ee);
3068 INIT_LIST_HEAD(&mdev->done_ee);
3069 INIT_LIST_HEAD(&mdev->read_ee);
3070 INIT_LIST_HEAD(&mdev->net_ee);
3071 INIT_LIST_HEAD(&mdev->resync_reads);
3072 INIT_LIST_HEAD(&mdev->data.work.q);
3073 INIT_LIST_HEAD(&mdev->meta.work.q);
3074 INIT_LIST_HEAD(&mdev->resync_work.list);
3075 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003076 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003077 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003078 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003079 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003080
Philipp Reisner794abb72010-12-27 11:51:23 +01003081 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003082 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003083 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003084 mdev->md_sync_work.cb = w_md_sync;
3085 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003086 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003087 init_timer(&mdev->resync_timer);
3088 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003089 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003090 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003091 mdev->resync_timer.function = resync_timer_fn;
3092 mdev->resync_timer.data = (unsigned long) mdev;
3093 mdev->md_sync_timer.function = md_sync_timer_fn;
3094 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003095 mdev->start_resync_timer.function = start_resync_timer_fn;
3096 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003097 mdev->request_timer.function = request_timer_fn;
3098 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003099
3100 init_waitqueue_head(&mdev->misc_wait);
3101 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003102 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003103 init_waitqueue_head(&mdev->ee_wait);
3104 init_waitqueue_head(&mdev->al_wait);
3105 init_waitqueue_head(&mdev->seq_wait);
3106
3107 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3108 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3109 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3110
3111 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003112 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003113 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003114 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3115 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003116}
3117
3118void drbd_mdev_cleanup(struct drbd_conf *mdev)
3119{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003120 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003121 if (mdev->receiver.t_state != None)
3122 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3123 mdev->receiver.t_state);
3124
3125 /* no need to lock it, I'm the only thread alive */
3126 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3127 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3128 mdev->al_writ_cnt =
3129 mdev->bm_writ_cnt =
3130 mdev->read_cnt =
3131 mdev->recv_cnt =
3132 mdev->send_cnt =
3133 mdev->writ_cnt =
3134 mdev->p_size =
3135 mdev->rs_start =
3136 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003137 mdev->rs_failed = 0;
3138 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003139 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003140 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3141 mdev->rs_mark_left[i] = 0;
3142 mdev->rs_mark_time[i] = 0;
3143 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003144 D_ASSERT(mdev->net_conf == NULL);
3145
3146 drbd_set_my_capacity(mdev, 0);
3147 if (mdev->bitmap) {
3148 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003149 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003150 drbd_bm_cleanup(mdev);
3151 }
3152
3153 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003154 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003155
3156 /*
3157	 * currently we call drbd_init_ee only on module load, so
3158	 * we may call drbd_release_ee only on module unload!
3159 */
3160 D_ASSERT(list_empty(&mdev->active_ee));
3161 D_ASSERT(list_empty(&mdev->sync_ee));
3162 D_ASSERT(list_empty(&mdev->done_ee));
3163 D_ASSERT(list_empty(&mdev->read_ee));
3164 D_ASSERT(list_empty(&mdev->net_ee));
3165 D_ASSERT(list_empty(&mdev->resync_reads));
3166 D_ASSERT(list_empty(&mdev->data.work.q));
3167 D_ASSERT(list_empty(&mdev->meta.work.q));
3168 D_ASSERT(list_empty(&mdev->resync_work.list));
3169 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003170 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003171
3172 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003173}
3174
3175
3176static void drbd_destroy_mempools(void)
3177{
3178 struct page *page;
3179
3180 while (drbd_pp_pool) {
3181 page = drbd_pp_pool;
3182 drbd_pp_pool = (struct page *)page_private(page);
3183 __free_page(page);
3184 drbd_pp_vacant--;
3185 }
3186
3187 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3188
3189 if (drbd_ee_mempool)
3190 mempool_destroy(drbd_ee_mempool);
3191 if (drbd_request_mempool)
3192 mempool_destroy(drbd_request_mempool);
3193 if (drbd_ee_cache)
3194 kmem_cache_destroy(drbd_ee_cache);
3195 if (drbd_request_cache)
3196 kmem_cache_destroy(drbd_request_cache);
3197 if (drbd_bm_ext_cache)
3198 kmem_cache_destroy(drbd_bm_ext_cache);
3199 if (drbd_al_ext_cache)
3200 kmem_cache_destroy(drbd_al_ext_cache);
3201
3202 drbd_ee_mempool = NULL;
3203 drbd_request_mempool = NULL;
3204 drbd_ee_cache = NULL;
3205 drbd_request_cache = NULL;
3206 drbd_bm_ext_cache = NULL;
3207 drbd_al_ext_cache = NULL;
3208
3209 return;
3210}
3211
3212static int drbd_create_mempools(void)
3213{
3214 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003215 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003216 int i;
3217
3218 /* prepare our caches and mempools */
3219 drbd_request_mempool = NULL;
3220 drbd_ee_cache = NULL;
3221 drbd_request_cache = NULL;
3222 drbd_bm_ext_cache = NULL;
3223 drbd_al_ext_cache = NULL;
3224 drbd_pp_pool = NULL;
3225
3226 /* caches */
3227 drbd_request_cache = kmem_cache_create(
3228 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3229 if (drbd_request_cache == NULL)
3230 goto Enomem;
3231
3232 drbd_ee_cache = kmem_cache_create(
3233 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3234 if (drbd_ee_cache == NULL)
3235 goto Enomem;
3236
3237 drbd_bm_ext_cache = kmem_cache_create(
3238 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3239 if (drbd_bm_ext_cache == NULL)
3240 goto Enomem;
3241
3242 drbd_al_ext_cache = kmem_cache_create(
3243 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3244 if (drbd_al_ext_cache == NULL)
3245 goto Enomem;
3246
3247 /* mempools */
3248 drbd_request_mempool = mempool_create(number,
3249 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3250 if (drbd_request_mempool == NULL)
3251 goto Enomem;
3252
3253 drbd_ee_mempool = mempool_create(number,
3254 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003255 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003256 goto Enomem;
3257
3258 /* drbd's page pool */
3259 spin_lock_init(&drbd_pp_lock);
3260
3261 for (i = 0; i < number; i++) {
3262 page = alloc_page(GFP_HIGHUSER);
3263 if (!page)
3264 goto Enomem;
3265 set_page_private(page, (unsigned long)drbd_pp_pool);
3266 drbd_pp_pool = page;
3267 }
3268 drbd_pp_vacant = number;
3269
3270 return 0;
3271
3272Enomem:
3273 drbd_destroy_mempools(); /* in case we allocated some */
3274 return -ENOMEM;
3275}
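
/*
 * Illustration only: the page pool built above is a plain singly linked list
 * of pages, chained through page_private() and protected by drbd_pp_lock.
 * The real producer/consumer (drbd_pp_alloc()/drbd_pp_free()) lives outside
 * this file; the sketch below only shows the "pop" half of the pattern.
 */
#if 0
static struct page *example_pp_pop(void)
{
	struct page *page;

	spin_lock(&drbd_pp_lock);
	page = drbd_pp_pool;
	if (page) {
		drbd_pp_pool = (struct page *)page_private(page);
		set_page_private(page, 0);
		drbd_pp_vacant--;
	}
	spin_unlock(&drbd_pp_lock);

	return page;
}
#endif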
3276
3277static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3278 void *unused)
3279{
3280 /* just so we have it. you never know what interesting things we
3281 * might want to do here some day...
3282 */
3283
3284 return NOTIFY_DONE;
3285}
3286
3287static struct notifier_block drbd_notifier = {
3288 .notifier_call = drbd_notify_sys,
3289};
3290
3291static void drbd_release_ee_lists(struct drbd_conf *mdev)
3292{
3293 int rr;
3294
3295 rr = drbd_release_ee(mdev, &mdev->active_ee);
3296 if (rr)
3297 dev_err(DEV, "%d EEs in active list found!\n", rr);
3298
3299 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3300 if (rr)
3301 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3302
3303 rr = drbd_release_ee(mdev, &mdev->read_ee);
3304 if (rr)
3305 dev_err(DEV, "%d EEs in read list found!\n", rr);
3306
3307 rr = drbd_release_ee(mdev, &mdev->done_ee);
3308 if (rr)
3309 dev_err(DEV, "%d EEs in done list found!\n", rr);
3310
3311 rr = drbd_release_ee(mdev, &mdev->net_ee);
3312 if (rr)
3313 dev_err(DEV, "%d EEs in net list found!\n", rr);
3314}
3315
3316/* caution. no locking.
3317 * currently only used from module cleanup code. */
3318static void drbd_delete_device(unsigned int minor)
3319{
3320 struct drbd_conf *mdev = minor_to_mdev(minor);
3321
3322 if (!mdev)
3323 return;
3324
Philipp Reisnerdfa8bed2011-06-29 14:06:08 +02003325 del_timer_sync(&mdev->request_timer);
3326
Philipp Reisnerb411b362009-09-25 16:07:19 -07003327 /* paranoia asserts */
3328 if (mdev->open_cnt != 0)
3329 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3330 __FILE__ , __LINE__);
3331
3332 ERR_IF (!list_empty(&mdev->data.work.q)) {
3333 struct list_head *lp;
3334 list_for_each(lp, &mdev->data.work.q) {
3335 dev_err(DEV, "lp = %p\n", lp);
3336 }
3337 };
3338 /* end paranoia asserts */
3339
3340 del_gendisk(mdev->vdisk);
3341
3342 /* cleanup stuff that may have been allocated during
3343 * device (re-)configuration or state changes */
3344
3345 if (mdev->this_bdev)
3346 bdput(mdev->this_bdev);
3347
3348 drbd_free_resources(mdev);
3349
3350 drbd_release_ee_lists(mdev);
3351
Bart Van Assche24c48302011-05-21 18:32:29 +02003352 /* should be freed on disconnect? */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003353 kfree(mdev->ee_hash);
3354 /*
3355 mdev->ee_hash_s = 0;
3356 mdev->ee_hash = NULL;
3357 */
3358
3359 lc_destroy(mdev->act_log);
3360 lc_destroy(mdev->resync);
3361
3362 kfree(mdev->p_uuid);
3363 /* mdev->p_uuid = NULL; */
3364
3365 kfree(mdev->int_dig_out);
3366 kfree(mdev->int_dig_in);
3367 kfree(mdev->int_dig_vv);
3368
3369 /* cleanup the rest that has been
3370 * allocated from drbd_new_device
3371 * and actually free the mdev itself */
3372 drbd_free_mdev(mdev);
3373}
3374
3375static void drbd_cleanup(void)
3376{
3377 unsigned int i;
3378
3379 unregister_reboot_notifier(&drbd_notifier);
3380
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003381 /* first remove proc,
3382	 * drbdsetup uses its presence to detect
3383	 * whether DRBD is loaded.
3384	 * If we got stuck in proc removal,
3385 * but have netlink already deregistered,
3386 * some drbdsetup commands may wait forever
3387 * for an answer.
3388 */
3389 if (drbd_proc)
3390 remove_proc_entry("drbd", NULL);
3391
Philipp Reisnerb411b362009-09-25 16:07:19 -07003392 drbd_nl_cleanup();
3393
3394 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003395 i = minor_count;
3396 while (i--)
3397 drbd_delete_device(i);
3398 drbd_destroy_mempools();
3399 }
3400
3401 kfree(minor_table);
3402
3403 unregister_blkdev(DRBD_MAJOR, "drbd");
3404
3405 printk(KERN_INFO "drbd: module cleanup done.\n");
3406}
3407
3408/**
3409 * drbd_congested() - Callback for pdflush
3410 * @congested_data: User data
3411 * @bdi_bits: Bits pdflush is currently interested in
3412 *
3413 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3414 */
3415static int drbd_congested(void *congested_data, int bdi_bits)
3416{
3417 struct drbd_conf *mdev = congested_data;
3418 struct request_queue *q;
3419 char reason = '-';
3420 int r = 0;
3421
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003422 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003423 /* DRBD has frozen IO */
3424 r = bdi_bits;
3425 reason = 'd';
3426 goto out;
3427 }
3428
3429 if (get_ldev(mdev)) {
3430 q = bdev_get_queue(mdev->ldev->backing_bdev);
3431 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3432 put_ldev(mdev);
3433 if (r)
3434 reason = 'b';
3435 }
3436
3437 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3438 r |= (1 << BDI_async_congested);
3439 reason = reason == 'b' ? 'a' : 'n';
3440 }
3441
3442out:
3443 mdev->congestion_reason = reason;
3444 return r;
3445}
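
/*
 * Reader's note (not from the original source): the single character stored
 * in mdev->congestion_reason above decodes as
 *
 *   '-'  not congested
 *   'd'  DRBD itself has frozen IO (may_inc_ap_bio() failed)
 *   'b'  the local backing device reports congestion
 *   'n'  the network path is congested (NET_CONGESTED is set)
 *   'a'  both the backing device and the network are congested
 *
 * The field is only recorded here; it is presumably consumed by the status
 * output elsewhere in the driver.
 */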
3446
3447struct drbd_conf *drbd_new_device(unsigned int minor)
3448{
3449 struct drbd_conf *mdev;
3450 struct gendisk *disk;
3451 struct request_queue *q;
3452
3453 /* GFP_KERNEL, we are outside of all write-out paths */
3454 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3455 if (!mdev)
3456 return NULL;
3457 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3458 goto out_no_cpumask;
3459
3460 mdev->minor = minor;
3461
3462 drbd_init_set_defaults(mdev);
3463
3464 q = blk_alloc_queue(GFP_KERNEL);
3465 if (!q)
3466 goto out_no_q;
3467 mdev->rq_queue = q;
3468 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003469
3470 disk = alloc_disk(1);
3471 if (!disk)
3472 goto out_no_disk;
3473 mdev->vdisk = disk;
3474
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003475 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003476
3477 disk->queue = q;
3478 disk->major = DRBD_MAJOR;
3479 disk->first_minor = minor;
3480 disk->fops = &drbd_ops;
3481 sprintf(disk->disk_name, "drbd%d", minor);
3482 disk->private_data = mdev;
3483
3484 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3485 /* we have no partitions. we contain only ourselves. */
3486 mdev->this_bdev->bd_contains = mdev->this_bdev;
3487
3488 q->backing_dev_info.congested_fn = drbd_congested;
3489 q->backing_dev_info.congested_data = mdev;
3490
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003491 blk_queue_make_request(q, drbd_make_request);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003492	/* Setting max_hw_sectors to an odd value of 8 KiB here;
3493	   this triggers a max_bio_size message upon first attach or connect */
3494 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003495 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3496 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003497 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003498
3499 mdev->md_io_page = alloc_page(GFP_KERNEL);
3500 if (!mdev->md_io_page)
3501 goto out_no_io_page;
3502
3503 if (drbd_bm_init(mdev))
3504 goto out_no_bitmap;
3505 /* no need to lock access, we are still initializing this minor device. */
3506 if (!tl_init(mdev))
3507 goto out_no_tl;
3508
3509 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3510 if (!mdev->app_reads_hash)
3511 goto out_no_app_reads;
3512
3513 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3514 if (!mdev->current_epoch)
3515 goto out_no_epoch;
3516
3517 INIT_LIST_HEAD(&mdev->current_epoch->list);
3518 mdev->epochs = 1;
3519
3520 return mdev;
3521
3522/* out_whatever_else:
3523 kfree(mdev->current_epoch); */
3524out_no_epoch:
3525 kfree(mdev->app_reads_hash);
3526out_no_app_reads:
3527 tl_cleanup(mdev);
3528out_no_tl:
3529 drbd_bm_cleanup(mdev);
3530out_no_bitmap:
3531 __free_page(mdev->md_io_page);
3532out_no_io_page:
3533 put_disk(disk);
3534out_no_disk:
3535 blk_cleanup_queue(q);
3536out_no_q:
3537 free_cpumask_var(mdev->cpu_mask);
3538out_no_cpumask:
3539 kfree(mdev);
3540 return NULL;
3541}
3542
3543/* counterpart of drbd_new_device.
3544 * last part of drbd_delete_device. */
3545void drbd_free_mdev(struct drbd_conf *mdev)
3546{
3547 kfree(mdev->current_epoch);
3548 kfree(mdev->app_reads_hash);
3549 tl_cleanup(mdev);
3550 if (mdev->bitmap) /* should no longer be there. */
3551 drbd_bm_cleanup(mdev);
3552 __free_page(mdev->md_io_page);
3553 put_disk(mdev->vdisk);
3554 blk_cleanup_queue(mdev->rq_queue);
3555 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003556 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003557 kfree(mdev);
3558}
3559
3560
3561int __init drbd_init(void)
3562{
3563 int err;
3564
3565 if (sizeof(struct p_handshake) != 80) {
3566 printk(KERN_ERR
3567 "drbd: never change the size or layout "
3568 "of the HandShake packet.\n");
3569 return -EINVAL;
3570 }
3571
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003572 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003573 printk(KERN_ERR
3574 "drbd: invalid minor_count (%d)\n", minor_count);
3575#ifdef MODULE
3576 return -EINVAL;
3577#else
3578 minor_count = 8;
3579#endif
3580 }
3581
3582 err = drbd_nl_init();
3583 if (err)
3584 return err;
3585
3586 err = register_blkdev(DRBD_MAJOR, "drbd");
3587 if (err) {
3588 printk(KERN_ERR
3589 "drbd: unable to register block device major %d\n",
3590 DRBD_MAJOR);
3591 return err;
3592 }
3593
3594 register_reboot_notifier(&drbd_notifier);
3595
3596 /*
3597 * allocate all necessary structs
3598 */
3599 err = -ENOMEM;
3600
3601 init_waitqueue_head(&drbd_pp_wait);
3602
3603 drbd_proc = NULL; /* play safe for drbd_cleanup */
3604 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3605 GFP_KERNEL);
3606 if (!minor_table)
3607 goto Enomem;
3608
3609 err = drbd_create_mempools();
3610 if (err)
3611 goto Enomem;
3612
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003613 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003614 if (!drbd_proc) {
3615 printk(KERN_ERR "drbd: unable to register proc file\n");
3616 goto Enomem;
3617 }
3618
3619 rwlock_init(&global_state_lock);
3620
3621 printk(KERN_INFO "drbd: initialized. "
3622 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3623 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3624 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3625 printk(KERN_INFO "drbd: registered as block device major %d\n",
3626 DRBD_MAJOR);
3627 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3628
3629 return 0; /* Success! */
3630
3631Enomem:
3632 drbd_cleanup();
3633 if (err == -ENOMEM)
3634 /* currently always the case */
3635 printk(KERN_ERR "drbd: ran out of memory\n");
3636 else
3637 printk(KERN_ERR "drbd: initialization failure\n");
3638 return err;
3639}
3640
3641void drbd_free_bc(struct drbd_backing_dev *ldev)
3642{
3643 if (ldev == NULL)
3644 return;
3645
Tejun Heoe525fd82010-11-13 11:55:17 +01003646 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3647 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003648
3649 kfree(ldev);
3650}
3651
3652void drbd_free_sock(struct drbd_conf *mdev)
3653{
3654 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003655 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003656 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3657 sock_release(mdev->data.socket);
3658 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003659 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003660 }
3661 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003662 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003663 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3664 sock_release(mdev->meta.socket);
3665 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003666 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003667 }
3668}
3669
3670
3671void drbd_free_resources(struct drbd_conf *mdev)
3672{
3673 crypto_free_hash(mdev->csums_tfm);
3674 mdev->csums_tfm = NULL;
3675 crypto_free_hash(mdev->verify_tfm);
3676 mdev->verify_tfm = NULL;
3677 crypto_free_hash(mdev->cram_hmac_tfm);
3678 mdev->cram_hmac_tfm = NULL;
3679 crypto_free_hash(mdev->integrity_w_tfm);
3680 mdev->integrity_w_tfm = NULL;
3681 crypto_free_hash(mdev->integrity_r_tfm);
3682 mdev->integrity_r_tfm = NULL;
3683
3684 drbd_free_sock(mdev);
3685
3686 __no_warn(local,
3687 drbd_free_bc(mdev->ldev);
3688 mdev->ldev = NULL;);
3689}
3690
3691/* meta data management */
3692
3693struct meta_data_on_disk {
3694 u64 la_size; /* last agreed size. */
3695 u64 uuid[UI_SIZE]; /* UUIDs. */
3696 u64 device_uuid;
3697 u64 reserved_u64_1;
3698 u32 flags; /* MDF */
3699 u32 magic;
3700 u32 md_size_sect;
3701 u32 al_offset; /* offset to this block */
3702 u32 al_nr_extents; /* important for restoring the AL */
3703 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3704 u32 bm_offset; /* offset to the bitmap, from here */
3705 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003706 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3707 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003708
3709} __packed;
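
/*
 * Sanity-check sketch (assumption: not present in the original source):
 * drbd_md_sync() below clears and writes exactly one 512 byte sector, so the
 * on-disk layout above must never outgrow that.  A compile time guard could
 * look like this:
 */
#if 0
static inline void meta_data_on_disk_fits_one_sector(void)
{
	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
}
#endif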
3710
3711/**
3712 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3713 * @mdev: DRBD device.
3714 */
3715void drbd_md_sync(struct drbd_conf *mdev)
3716{
3717 struct meta_data_on_disk *buffer;
3718 sector_t sector;
3719 int i;
3720
Lars Ellenbergee15b032010-09-03 10:00:09 +02003721 del_timer(&mdev->md_sync_timer);
3722 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003723 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3724 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003725
3726	/* We use D_FAILED here, and not D_ATTACHING, because we try to write
3727	 * metadata even if we detach due to a disk failure! */
3728 if (!get_ldev_if_state(mdev, D_FAILED))
3729 return;
3730
Philipp Reisnere1711732011-06-27 11:51:46 +02003731 buffer = drbd_md_get_buffer(mdev);
3732 if (!buffer)
3733 goto out;
3734
Philipp Reisnerb411b362009-09-25 16:07:19 -07003735 memset(buffer, 0, 512);
3736
3737 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3738 for (i = UI_CURRENT; i < UI_SIZE; i++)
3739 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3740 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3741 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3742
3743 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3744 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3745 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3746 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3747 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3748
3749 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003750 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003751
3752 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3753 sector = mdev->ldev->md.md_offset;
3754
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003755 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003756		/* this was a try anyway ... */
3757 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003758 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003759 }
3760
3761 /* Update mdev->ldev->md.la_size_sect,
3762	 * since we just wrote the new value to the on-disk meta data. */
3763 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3764
Philipp Reisnere1711732011-06-27 11:51:46 +02003765 drbd_md_put_buffer(mdev);
3766out:
Philipp Reisnerb411b362009-09-25 16:07:19 -07003767 put_ldev(mdev);
3768}
3769
3770/**
3771 * drbd_md_read() - Reads in the meta data super block
3772 * @mdev: DRBD device.
3773 * @bdev: Device from which the meta data should be read in.
3774 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003775 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003776 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3777 */
3778int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3779{
3780 struct meta_data_on_disk *buffer;
3781 int i, rv = NO_ERROR;
3782
3783 if (!get_ldev_if_state(mdev, D_ATTACHING))
3784 return ERR_IO_MD_DISK;
3785
Philipp Reisnere1711732011-06-27 11:51:46 +02003786 buffer = drbd_md_get_buffer(mdev);
3787 if (!buffer)
3788 goto out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003789
3790 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003791 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003792 called BEFORE disk is attached */
3793 dev_err(DEV, "Error while reading metadata.\n");
3794 rv = ERR_IO_MD_DISK;
3795 goto err;
3796 }
3797
3798 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3799 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3800 rv = ERR_MD_INVALID;
3801 goto err;
3802 }
3803 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3804 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3805 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3806 rv = ERR_MD_INVALID;
3807 goto err;
3808 }
3809 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3810 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3811 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3812 rv = ERR_MD_INVALID;
3813 goto err;
3814 }
3815 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3816 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3817 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3818 rv = ERR_MD_INVALID;
3819 goto err;
3820 }
3821
3822 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3823 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3824 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3825 rv = ERR_MD_INVALID;
3826 goto err;
3827 }
3828
3829 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3830 for (i = UI_CURRENT; i < UI_SIZE; i++)
3831 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3832 bdev->md.flags = be32_to_cpu(buffer->flags);
3833 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3834 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3835
Philipp Reisner99432fc2011-05-20 16:39:13 +02003836 spin_lock_irq(&mdev->req_lock);
3837 if (mdev->state.conn < C_CONNECTED) {
3838 int peer;
3839 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3840 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3841 mdev->peer_max_bio_size = peer;
3842 }
3843 spin_unlock_irq(&mdev->req_lock);
3844
Philipp Reisnerb411b362009-09-25 16:07:19 -07003845 if (mdev->sync_conf.al_extents < 7)
3846 mdev->sync_conf.al_extents = 127;
3847
3848 err:
Philipp Reisnere1711732011-06-27 11:51:46 +02003849 drbd_md_put_buffer(mdev);
3850 out:
Philipp Reisnerb411b362009-09-25 16:07:19 -07003851 put_ldev(mdev);
3852
3853 return rv;
3854}
3855
3856/**
3857 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3858 * @mdev: DRBD device.
3859 *
3860 * Call this function if you change anything that should be written to
3861 * the meta-data super block. This function sets MD_DIRTY, and starts a
3862 * timer that ensures drbd_md_sync() gets called within five seconds.
3863 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003864#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003865void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3866{
3867 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3868 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3869 mdev->last_md_mark_dirty.line = line;
3870 mdev->last_md_mark_dirty.func = func;
3871 }
3872}
3873#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003874void drbd_md_mark_dirty(struct drbd_conf *mdev)
3875{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003876 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003877 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003878}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003879#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07003880
3881static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3882{
3883 int i;
3884
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003885 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003886 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003887}
3888
3889void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3890{
3891 if (idx == UI_CURRENT) {
3892 if (mdev->state.role == R_PRIMARY)
3893 val |= 1;
3894 else
3895 val &= ~((u64)1);
3896
3897 drbd_set_ed_uuid(mdev, val);
3898 }
3899
3900 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003901 drbd_md_mark_dirty(mdev);
3902}
3903
3904
3905void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3906{
3907 if (mdev->ldev->md.uuid[idx]) {
3908 drbd_uuid_move_history(mdev);
3909 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003910 }
3911 _drbd_uuid_set(mdev, idx, val);
3912}
3913
3914/**
3915 * drbd_uuid_new_current() - Creates a new current UUID
3916 * @mdev: DRBD device.
3917 *
3918 * Creates a new current UUID, and rotates the old current UUID into
3919 * the bitmap slot. Causes an incremental resync upon next connect.
3920 */
3921void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3922{
3923 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003924 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003925
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003926 if (bm_uuid)
3927 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3928
Philipp Reisnerb411b362009-09-25 16:07:19 -07003929 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003930
3931 get_random_bytes(&val, sizeof(u64));
3932 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003933 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003934 /* get it to stable storage _now_ */
3935 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003936}
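
/*
 * Worked example (the concrete values are made up): if UI_CURRENT held
 * 0xABCD000000000000 before drbd_uuid_new_current(), afterwards
 *
 *   UI_BITMAP  = 0xABCD0000000000000's old value, i.e. the previous UI_CURRENT
 *   UI_CURRENT = fresh random value, with bit 0 set iff we are R_PRIMARY
 *
 * so on the next handshake the peers see differing current UUIDs but a
 * matching bitmap UUID, which is roughly what selects the incremental
 * (bitmap based) resync mentioned in the comment above.
 */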
3937
3938void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3939{
3940 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3941 return;
3942
3943 if (val == 0) {
3944 drbd_uuid_move_history(mdev);
3945 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3946 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003947 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003948 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3949 if (bm_uuid)
3950 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003951
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003952 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003953 }
3954 drbd_md_mark_dirty(mdev);
3955}
3956
3957/**
3958 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3959 * @mdev: DRBD device.
3960 *
3961 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3962 */
3963int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3964{
3965 int rv = -EIO;
3966
3967 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3968 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3969 drbd_md_sync(mdev);
3970 drbd_bm_set_all(mdev);
3971
3972 rv = drbd_bm_write(mdev);
3973
3974 if (!rv) {
3975 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3976 drbd_md_sync(mdev);
3977 }
3978
3979 put_ldev(mdev);
3980 }
3981
3982 return rv;
3983}
3984
3985/**
3986 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3987 * @mdev: DRBD device.
3988 *
3989 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3990 */
3991int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3992{
3993 int rv = -EIO;
3994
Philipp Reisner07782862010-08-31 12:00:50 +02003995 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003996 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3997 drbd_bm_clear_all(mdev);
3998 rv = drbd_bm_write(mdev);
3999 put_ldev(mdev);
4000 }
4001
4002 return rv;
4003}
4004
4005static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4006{
4007 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01004008 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004009
4010 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4011
Lars Ellenberg02851e92010-12-16 14:47:39 +01004012 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004013 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01004014 rv = work->io_fn(mdev);
4015 drbd_bm_unlock(mdev);
4016 put_ldev(mdev);
4017 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004018
4019 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01004020 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07004021 wake_up(&mdev->misc_wait);
4022
4023 if (work->done)
4024 work->done(mdev, rv);
4025
4026 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
4027 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004028 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004029
4030 return 1;
4031}
4032
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004033void drbd_ldev_destroy(struct drbd_conf *mdev)
4034{
4035 lc_destroy(mdev->resync);
4036 mdev->resync = NULL;
4037 lc_destroy(mdev->act_log);
4038 mdev->act_log = NULL;
4039 __no_warn(local,
4040 drbd_free_bc(mdev->ldev);
4041 mdev->ldev = NULL;);
4042
4043 if (mdev->md_io_tmpp) {
4044 __free_page(mdev->md_io_tmpp);
4045 mdev->md_io_tmpp = NULL;
4046 }
4047 clear_bit(GO_DISKLESS, &mdev->flags);
4048}
4049
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004050static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4051{
4052 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02004053 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4054 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004055 * the protected members anymore, though, so once put_ldev reaches zero
4056 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004057 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004058 return 1;
4059}
4060
4061void drbd_go_diskless(struct drbd_conf *mdev)
4062{
4063 D_ASSERT(mdev->state.disk == D_FAILED);
4064 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02004065 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004066}
4067
Philipp Reisnerb411b362009-09-25 16:07:19 -07004068/**
4069 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4070 * @mdev: DRBD device.
4071 * @io_fn: IO callback to be called when bitmap IO is possible
4072 * @done: callback to be called after the bitmap IO was performed
4073 * @why: Descriptive text of the reason for doing the IO
4074 *
4075 * While IO on the bitmap happens we freeze application IO, thus ensuring
4076 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4077 * called from worker context. It MUST NOT be used while a previous such
4078 * work is still pending!
4079 */
4080void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4081 int (*io_fn)(struct drbd_conf *),
4082 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004083 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004084{
4085 D_ASSERT(current == mdev->worker.task);
4086
4087 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4088 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4089 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4090 if (mdev->bm_io_work.why)
4091 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4092 why, mdev->bm_io_work.why);
4093
4094 mdev->bm_io_work.io_fn = io_fn;
4095 mdev->bm_io_work.done = done;
4096 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004097 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004098
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004099 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004100 set_bit(BITMAP_IO, &mdev->flags);
4101 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004102 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004103 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004104 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004105 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004106}
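
/*
 * Usage sketch (illustration only; BM_LOCKED_MASK is assumed to be the
 * "fully locked" value of enum bm_flag from drbd_int.h): from worker
 * context, a full "set all bits and write out" pass could be queued like
 * this, with an optional completion callback.
 */
#if 0
static void example_set_n_write_done(struct drbd_conf *mdev, int rv)
{
	if (rv)
		dev_err(DEV, "writing out the full bitmap failed: %d\n", rv);
}

static void example_queue_full_bitmap_write(struct drbd_conf *mdev)
{
	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
			     example_set_n_write_done,
			     "example full bitmap write", BM_LOCKED_MASK);
}
#endif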
4107
4108/**
4109 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4110 * @mdev: DRBD device.
4111 * @io_fn: IO callback to be called when bitmap IO is possible
4112 * @why: Descriptive text of the reason for doing the IO
4113 *
4114 * freezes application IO while the actual bitmap IO runs. This
4115 * function MAY NOT be called from worker context.
4116 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004117int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4118 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004119{
4120 int rv;
4121
4122 D_ASSERT(current != mdev->worker.task);
4123
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004124 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4125 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004126
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004127 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004128 rv = io_fn(mdev);
4129 drbd_bm_unlock(mdev);
4130
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004131 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4132 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004133
4134 return rv;
4135}
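
/*
 * Synchronous counterpart sketch (illustration only, same BM_LOCKED_MASK
 * assumption as above): outside of worker context the bitmap can be cleared
 * and written out in one blocking call.
 */
#if 0
static int example_clear_bitmap_now(struct drbd_conf *mdev)
{
	return drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
			      "example clear_n_write", BM_LOCKED_MASK);
}
#endif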
4136
4137void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4138{
4139 if ((mdev->ldev->md.flags & flag) != flag) {
4140 drbd_md_mark_dirty(mdev);
4141 mdev->ldev->md.flags |= flag;
4142 }
4143}
4144
4145void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4146{
4147 if ((mdev->ldev->md.flags & flag) != 0) {
4148 drbd_md_mark_dirty(mdev);
4149 mdev->ldev->md.flags &= ~flag;
4150 }
4151}
4152int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4153{
4154 return (bdev->md.flags & flag) != 0;
4155}
4156
4157static void md_sync_timer_fn(unsigned long data)
4158{
4159 struct drbd_conf *mdev = (struct drbd_conf *) data;
4160
4161 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4162}
4163
4164static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4165{
4166 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004167#ifdef DEBUG
4168 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4169 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4170#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004171 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004172 return 1;
4173}
4174
4175#ifdef CONFIG_DRBD_FAULT_INJECTION
4176/* Fault insertion support including random number generator shamelessly
4177 * stolen from kernel/rcutorture.c */
4178struct fault_random_state {
4179 unsigned long state;
4180 unsigned long count;
4181};
4182
4183#define FAULT_RANDOM_MULT 39916801 /* prime */
4184#define FAULT_RANDOM_ADD 479001701 /* prime */
4185#define FAULT_RANDOM_REFRESH 10000
4186
4187/*
4188 * Crude but fast random-number generator. Uses a linear congruential
4189 * generator, with occasional help from get_random_bytes().
4190 */
4191static unsigned long
4192_drbd_fault_random(struct fault_random_state *rsp)
4193{
4194 long refresh;
4195
Roel Kluin49829ea2009-12-15 22:55:44 +01004196 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004197 get_random_bytes(&refresh, sizeof(refresh));
4198 rsp->state += refresh;
4199 rsp->count = FAULT_RANDOM_REFRESH;
4200 }
4201 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4202 return swahw32(rsp->state);
4203}
4204
4205static char *
4206_drbd_fault_str(unsigned int type) {
4207 static char *_faults[] = {
4208 [DRBD_FAULT_MD_WR] = "Meta-data write",
4209 [DRBD_FAULT_MD_RD] = "Meta-data read",
4210 [DRBD_FAULT_RS_WR] = "Resync write",
4211 [DRBD_FAULT_RS_RD] = "Resync read",
4212 [DRBD_FAULT_DT_WR] = "Data write",
4213 [DRBD_FAULT_DT_RD] = "Data read",
4214 [DRBD_FAULT_DT_RA] = "Data read ahead",
4215 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004216 [DRBD_FAULT_AL_EE] = "EE allocation",
4217 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004218 };
4219
4220 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4221}
4222
4223unsigned int
4224_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4225{
4226 static struct fault_random_state rrs = {0, 0};
4227
4228 unsigned int ret = (
4229 (fault_devs == 0 ||
4230 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4231 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4232
4233 if (ret) {
4234 fault_count++;
4235
Lars Ellenberg73835062010-05-27 11:51:56 +02004236 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004237 dev_warn(DEV, "***Simulating %s failure\n",
4238 _drbd_fault_str(type));
4239 }
4240
4241 return ret;
4242}
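
/*
 * Worked example (parameter names as used above, concrete values made up):
 * with fault_rate=5 and fault_devs=0x1, roughly 5% of the eligible requests
 * on minor 0 get a simulated failure, because
 *
 *   ((_drbd_fault_random() % 100) + 1) <= fault_rate   compares 1..100 with 5
 *   (1 << mdev_to_minor(mdev)) & fault_devs            restricts it to minor 0
 *
 * and fault_devs == 0 means "all minors", as the first half of the
 * condition above shows.
 */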
4243#endif
4244
4245const char *drbd_buildtag(void)
4246{
4247	/* DRBD built from external sources has a reference here to the
4248	   git hash of the source code. */
4249
4250 static char buildtag[38] = "\0uilt-in";
4251
4252 if (buildtag[0] == 0) {
4253#ifdef CONFIG_MODULES
4254 if (THIS_MODULE != NULL)
4255 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4256 else
4257#endif
4258 buildtag[0] = 'b';
4259 }
4260
4261 return buildtag;
4262}
4263
4264module_init(drbd_init)
4265module_exit(drbd_cleanup)
4266
Philipp Reisnerb411b362009-09-25 16:07:19 -07004267EXPORT_SYMBOL(drbd_conn_str);
4268EXPORT_SYMBOL(drbd_role_str);
4269EXPORT_SYMBOL(drbd_disk_str);
4270EXPORT_SYMBOL(drbd_set_st_err_str);