/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
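/*
 * Illustration only: when built as a module, these can be set as
 * "modprobe drbd minor_count=8 allow_oos=0"; when built into the kernel,
 * the same values go on the kernel command line as "drbd.minor_count=8".
 * The values shown here are examples, not defaults.
 */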

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;	/* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;
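/*
 * Sketch only (the real alloc/free helpers live elsewhere in the driver):
 * the pool behaves like a LIFO stack of pages chained through the otherwise
 * unused page private field, roughly
 *
 *	spin_lock(&drbd_pp_lock);
 *	page = drbd_pp_pool;				// pop one page
 *	if (page) {
 *		drbd_pp_pool = (struct page *)page_private(page);
 *		drbd_pp_vacant--;
 *	}
 *	spin_unlock(&drbd_pp_lock);
 */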

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
 * attached.
 */
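/*
 * Illustration only (not driver logic): iterating all epochs of the
 * transfer log therefore amounts to
 *
 *	struct drbd_tl_epoch *b;
 *	for (b = mdev->oldest_tle; b != NULL; b = b->next)
 *		... look at the requests chained on &b->requests ...
 *
 * _tl_restart() below performs exactly this kind of walk.
 */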
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}
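/*
 * Usage note: every caller of _tl_add_barrier() must hold req_lock, as the
 * kernel-doc above states. Within this file, tl_release() uses it to recycle
 * the just-acknowledged epoch object while CREATE_BARRIER is set; the request
 * submission path (outside this file) is assumed to follow the same locking
 * rule when it appends a fresh epoch.
 */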

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		       unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}


/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:       The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}
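/*
 * Spelled out, the expression above treats the following as cluster-wide
 * changes while connected: becoming Primary, starting a resync in either
 * direction, detaching to D_DISKLESS, going to C_DISCONNECTING, and starting
 * online verify (C_VERIFY_S). Everything else can be decided locally.
 */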

enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}
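/*
 * Typical use, as in tl_release() above: when the peer's barrier ack does not
 * match our transfer log, the connection state is forced to C_PROTOCOL_ERROR:
 *
 *	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 */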

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
enum sanitize_state_warnings {
	NO_WARNING,
	ABORTED_ONLINE_VERIFY,
	ABORTED_RESYNC,
	CONNECTION_LOST_NEGOTIATING,
	IMPLICITLY_UPGRADED_DISK,
	IMPLICITLY_UPGRADED_PDSK,
};
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform a possibly cluster-wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
	          union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
		rv = SS_CONNECTED_OUTDATES;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
{
	static const char *msg_table[] = {
		[NO_WARNING] = "",
		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
		[ABORTED_RESYNC] = "Resync aborted.",
		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
	};

	if (warn != NO_WARNING)
		dev_warn(DEV, "%s\n", msg_table[warn]);
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn:	where to report implicit state-change warnings, may be NULL.
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	if (warn)
		*warn = NO_WARNING;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow Network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn)
			*warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			if (warn)
				*warn = CONNECTION_LOST_NEGOTIATING;
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection state on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_DISK;
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_PDSK;
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}
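/*
 * Example of the sanitizing above: if the requested transition drops the
 * connection below C_CONNECTED, the peer role is forced to R_UNKNOWN and an
 * out-of-range pdsk is forced to D_UNKNOWN, no matter what the caller asked
 * for. The optional *warn argument only reports a subset of such implicit
 * changes (aborted verify/resync, lost negotiation, implicit disk upgrades).
 */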

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}
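/*
 * Worked example (assuming the usual 4 KiB of data per bitmap bit, i.e.
 * 8 sectors per bit): with ov_start_sector == 800 and a bitmap of 1000 bits,
 * BM_SECT_TO_BIT(800) == 100, so rs_total and ov_left become 900 and the
 * verify run covers the last 900 bits starting at ov_position == 800.
 */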

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	enum sanitize_state_warnings ssw;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &ssw);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	print_sanitize_warnings(mdev, ssw);

	{
	char *pbp, pb[300];
	pbp = pb;
	*pbp = 0;
	if (ns.role != os.role)
		pbp += sprintf(pbp, "role( %s -> %s ) ",
			       drbd_role_str(os.role),
			       drbd_role_str(ns.role));
	if (ns.peer != os.peer)
		pbp += sprintf(pbp, "peer( %s -> %s ) ",
			       drbd_role_str(os.peer),
			       drbd_role_str(ns.peer));
	if (ns.conn != os.conn)
		pbp += sprintf(pbp, "conn( %s -> %s ) ",
			       drbd_conn_str(os.conn),
			       drbd_conn_str(ns.conn));
	if (ns.disk != os.disk)
		pbp += sprintf(pbp, "disk( %s -> %s ) ",
			       drbd_disk_str(os.disk),
			       drbd_disk_str(ns.disk));
	if (ns.pdsk != os.pdsk)
		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
			       drbd_disk_str(os.pdsk),
			       drbd_disk_str(ns.pdsk));
	if (is_susp(ns) != is_susp(os))
		pbp += sprintf(pbp, "susp( %d -> %d ) ",
			       is_susp(os),
			       is_susp(ns));
	if (ns.aftr_isp != os.aftr_isp)
		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
			       os.aftr_isp,
			       ns.aftr_isp);
	if (ns.peer_isp != os.peer_isp)
		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
			       os.peer_isp,
			       ns.peer_isp);
	if (ns.user_isp != os.user_isp)
		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
			       os.user_isp,
			       ns.user_isp);
	dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

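/* __drbd_set_state() runs with req_lock held and must not sleep; the actions
 * that may sleep are packed into an after_state_chg_work item and handed to
 * the worker, which invokes after_state_ch() through this callback. */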
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
		int (*io_fn)(struct drbd_conf *),
		char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current == mdev->worker.task);

	/* open coded non-blocking drbd_suspend_io(mdev); */
	set_bit(SUSPEND_IO, &mdev->flags);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}
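/*
 * Note: unlike drbd_queue_bitmap_io(), this helper runs io_fn() synchronously
 * in worker context (hence the D_ASSERT above). It is used below for the
 * bitmap writes on demote, where the caller already is the worker executing
 * after_state_ch().
 */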

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
			what = resend;

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = restart_frozen_disk_io;

		if (what != nothing)
			nsm.susp_nod = 0;
	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}

	/* Became sync source.  With protocol >= 96, we still need to send out
	 * the sync uuid now. Need to do that before any drbd_send_state, or
	 * the other side may go "paused sync" before receiving the sync uuids,
	 * which is unexpected. */
	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
		drbd_gen_and_send_sync_uuid(mdev);
		put_ldev(mdev);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	/* No point in queuing send_bitmap if we don't have a connection
	 * anymore, so check also the _current_ state, not only the new state
	 * at the time this work was queued. */
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
	    mdev->state.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
				"send_bitmap (WFBitMapS)",
				BM_LOCKED_TEST_ALLOWED);

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					set_bit(NEW_CUR_UUID, &mdev->flags);
				} else {
					drbd_uuid_new_current(mdev);
					drbd_send_uuids(mdev);
				}
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
			drbd_uuid_new_current(mdev);
			drbd_send_uuids(mdev);
		}

		/* D_DISKLESS Peer becomes secondary */
		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
			/* We may still be Primary ourselves.
			 * No harm done if the bitmap still changes,
			 * redirtied pages will follow later. */
			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
		put_ldev(mdev);
	}

Lars Ellenberg06d33e92010-12-18 17:00:59 +01001484 /* Write out all changed bits on demote.
1485	 * Though, no need to do that just yet
1486 * if there is a resync going on still */
1487 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1488 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001489 /* No changes to the bitmap expected this time, so assert that,
1490 * even though no harm was done if it did change. */
1491 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1492 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001493 put_ldev(mdev);
1494 }
1495
1496 /* Last part of the attaching process ... */
1497 if (ns.conn >= C_CONNECTED &&
1498 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001499 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001500 drbd_send_uuids(mdev);
1501 drbd_send_state(mdev);
1502 }
1503
1504 /* We want to pause/continue resync, tell peer. */
1505 if (ns.conn >= C_CONNECTED &&
1506 ((os.aftr_isp != ns.aftr_isp) ||
1507 (os.user_isp != ns.user_isp)))
1508 drbd_send_state(mdev);
1509
1510 /* In case one of the isp bits got set, suspend other devices. */
1511 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1512 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1513 suspend_other_sg(mdev);
1514
1515	/* Make sure the peer gets informed about possible state
1516 changes (ISP bits) while we were in WFReportParams. */
1517 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1518 drbd_send_state(mdev);
1519
Philipp Reisner67531712010-10-27 12:21:30 +02001520 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1521 drbd_send_state(mdev);
1522
Philipp Reisnerb411b362009-09-25 16:07:19 -07001523	/* We are in the process of starting a full sync... */
1524 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1525 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001526 /* no other bitmap changes expected during this phase */
1527 drbd_queue_bitmap_io(mdev,
1528 &drbd_bmio_set_n_write, &abw_start_sync,
1529 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001530
1531	/* We are invalidating ourselves... */
1532 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1533 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001534 /* other bitmap operation expected during this phase */
1535 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1536 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001537
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001538 /* first half of local IO error, failure to attach,
1539 * or administrative detach */
1540 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1541 enum drbd_io_error_p eh;
1542 int was_io_error;
1543 /* corresponding get_ldev was in __drbd_set_state, to serialize
1544 * our cleanup here with the transition to D_DISKLESS,
1545	 * so it is safe to dereference ldev here. */
1546 eh = mdev->ldev->dc.on_io_error;
1547 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1548
1549 /* current state still has to be D_FAILED,
1550 * there is only one way out: to D_DISKLESS,
1551 * and that may only happen after our put_ldev below. */
1552 if (mdev->state.disk != D_FAILED)
1553 dev_err(DEV,
1554 "ASSERT FAILED: disk is %s during detach\n",
1555 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001556
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001557 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001558 dev_info(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001559
1560 drbd_rs_cancel_all(mdev);
1561
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001562 /* In case we want to get something to stable storage still,
1563 * this may be the last chance.
1564 * Following put_ldev may transition to D_DISKLESS. */
1565 drbd_md_sync(mdev);
1566 put_ldev(mdev);
1567
1568 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001569 drbd_khelper(mdev, "local-io-error");
1570 }
1571
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001572 /* second half of local IO error, failure to attach,
1573 * or administrative detach,
1574 * after local_cnt references have reached zero again */
1575 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1576 /* We must still be diskless,
1577 * re-attach has to be serialized with this! */
1578 if (mdev->state.disk != D_DISKLESS)
1579 dev_err(DEV,
1580 "ASSERT FAILED: disk is %s while going diskless\n",
1581 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001582
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001583 mdev->rs_total = 0;
1584 mdev->rs_failed = 0;
1585 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001586
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001587 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001588 dev_info(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001589 /* corresponding get_ldev in __drbd_set_state
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001590 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001591 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001592 }
1593
Philipp Reisner738a84b2011-03-03 00:21:30 +01001594	/* Notify peer that I had a local IO error and did not detach. */
1595 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1596 drbd_send_state(mdev);
1597
Philipp Reisnerb411b362009-09-25 16:07:19 -07001598 /* Disks got bigger while they were detached */
1599 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1600 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1601 if (ns.conn == C_CONNECTED)
1602 resync_after_online_grow(mdev);
1603 }
1604
1605 /* A resync finished or aborted, wake paused devices... */
1606 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1607 (os.peer_isp && !ns.peer_isp) ||
1608 (os.user_isp && !ns.user_isp))
1609 resume_next_sg(mdev);
1610
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001611 /* sync target done with resync. Explicitly notify peer, even though
1612 * it should (at least for non-empty resyncs) already know itself. */
1613 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1614 drbd_send_state(mdev);
1615
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001616 /* This triggers bitmap writeout of potentially still unwritten pages
1617 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001618 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001619 * For resync aborted because of local disk failure, we cannot do
1620 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001621 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001622 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001623 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1624 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1625 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001626 put_ldev(mdev);
1627 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001628
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001629	/* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001630 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001631 drbd_free_tl_hash(mdev);
1632
Philipp Reisnerb411b362009-09-25 16:07:19 -07001633 /* Upon network connection, we need to start the receiver */
1634 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1635 drbd_thread_start(&mdev->receiver);
1636
1637 /* Terminate worker thread if we are unconfigured - it will be
1638 restarted as needed... */
1639 if (ns.disk == D_DISKLESS &&
1640 ns.conn == C_STANDALONE &&
1641 ns.role == R_SECONDARY) {
1642 if (os.aftr_isp != ns.aftr_isp)
1643 resume_next_sg(mdev);
1644 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1645 if (test_bit(DEVICE_DYING, &mdev->flags))
1646 drbd_thread_stop_nowait(&mdev->worker);
1647 }
1648
1649 drbd_md_sync(mdev);
1650}
1651
1652
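/* Common entry point of all three DRBD kernel threads (receiver, worker,
 * asender): run thi->function(), loop if the thread was flagged "Restarting"
 * in the meantime, then reset the thread state, complete &thi->stop and drop
 * the module reference taken in drbd_thread_start(). */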
1653static int drbd_thread_setup(void *arg)
1654{
1655 struct drbd_thread *thi = (struct drbd_thread *) arg;
1656 struct drbd_conf *mdev = thi->mdev;
1657 unsigned long flags;
1658 int retval;
1659
1660restart:
1661 retval = thi->function(thi);
1662
1663 spin_lock_irqsave(&thi->t_lock, flags);
1664
1665 /* if the receiver has been "Exiting", the last thing it did
1666 * was set the conn state to "StandAlone",
1667 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1668 * and receiver thread will be "started".
1669 * drbd_thread_start needs to set "Restarting" in that case.
1670 * t_state check and assignment needs to be within the same spinlock,
1671 * so either thread_start sees Exiting, and can remap to Restarting,
1672	 * or thread_start sees None, and can proceed as normal.
1673 */
1674
1675 if (thi->t_state == Restarting) {
1676 dev_info(DEV, "Restarting %s\n", current->comm);
1677 thi->t_state = Running;
1678 spin_unlock_irqrestore(&thi->t_lock, flags);
1679 goto restart;
1680 }
1681
1682 thi->task = NULL;
1683 thi->t_state = None;
1684 smp_mb();
1685 complete(&thi->stop);
1686 spin_unlock_irqrestore(&thi->t_lock, flags);
1687
1688 dev_info(DEV, "Terminating %s\n", current->comm);
1689
1690 /* Release mod reference taken when thread was started */
1691 module_put(THIS_MODULE);
1692 return retval;
1693}
1694
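/* Prepare a struct drbd_thread for drbd_thread_start(): initialize its lock,
 * clear task pointer and state, and remember which function it will run. */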
1695static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1696 int (*func) (struct drbd_thread *))
1697{
1698 spin_lock_init(&thi->t_lock);
1699 thi->task = NULL;
1700 thi->t_state = None;
1701 thi->function = func;
1702 thi->mdev = mdev;
1703}
1704
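/**
 * drbd_thread_start() - start (or flag for restart) one of the DRBD threads
 * @thi:	the receiver, asender or worker thread of this device
 *
 * If the thread is not running, take a module reference and create the
 * kthread; if it is currently "Exiting", flag it "Restarting" instead.
 * Returns true on success or if the thread is already running, false if
 * the module reference or the kthread could not be obtained.
 */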
1705int drbd_thread_start(struct drbd_thread *thi)
1706{
1707 struct drbd_conf *mdev = thi->mdev;
1708 struct task_struct *nt;
1709 unsigned long flags;
1710
1711 const char *me =
1712 thi == &mdev->receiver ? "receiver" :
1713 thi == &mdev->asender ? "asender" :
1714 thi == &mdev->worker ? "worker" : "NONSENSE";
1715
1716 /* is used from state engine doing drbd_thread_stop_nowait,
1717 * while holding the req lock irqsave */
1718 spin_lock_irqsave(&thi->t_lock, flags);
1719
1720 switch (thi->t_state) {
1721 case None:
1722 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1723 me, current->comm, current->pid);
1724
1725 /* Get ref on module for thread - this is released when thread exits */
1726 if (!try_module_get(THIS_MODULE)) {
1727 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1728 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001729 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001730 }
1731
1732 init_completion(&thi->stop);
1733 D_ASSERT(thi->task == NULL);
1734 thi->reset_cpu_mask = 1;
1735 thi->t_state = Running;
1736 spin_unlock_irqrestore(&thi->t_lock, flags);
1737		flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1738
1739 nt = kthread_create(drbd_thread_setup, (void *) thi,
1740 "drbd%d_%s", mdev_to_minor(mdev), me);
1741
1742 if (IS_ERR(nt)) {
1743 dev_err(DEV, "Couldn't start thread\n");
1744
1745 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001746 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001747 }
1748 spin_lock_irqsave(&thi->t_lock, flags);
1749 thi->task = nt;
1750 thi->t_state = Running;
1751 spin_unlock_irqrestore(&thi->t_lock, flags);
1752 wake_up_process(nt);
1753 break;
1754 case Exiting:
1755 thi->t_state = Restarting;
1756 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1757 me, current->comm, current->pid);
1758 /* fall through */
1759 case Running:
1760 case Restarting:
1761 default:
1762 spin_unlock_irqrestore(&thi->t_lock, flags);
1763 break;
1764 }
1765
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001766 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001767}
1768
1769
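/* Ask a thread to stop (Exiting) or restart (Restarting): set the target
 * state, interrupt the thread with DRBD_SIGKILL, and, if @wait is set,
 * block until the thread has completed &thi->stop. */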
1770void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1771{
1772 unsigned long flags;
1773
1774 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1775
1776 /* may be called from state engine, holding the req lock irqsave */
1777 spin_lock_irqsave(&thi->t_lock, flags);
1778
1779 if (thi->t_state == None) {
1780 spin_unlock_irqrestore(&thi->t_lock, flags);
1781 if (restart)
1782 drbd_thread_start(thi);
1783 return;
1784 }
1785
1786 if (thi->t_state != ns) {
1787 if (thi->task == NULL) {
1788 spin_unlock_irqrestore(&thi->t_lock, flags);
1789 return;
1790 }
1791
1792 thi->t_state = ns;
1793 smp_mb();
1794 init_completion(&thi->stop);
1795 if (thi->task != current)
1796 force_sig(DRBD_SIGKILL, thi->task);
1797
1798 }
1799
1800 spin_unlock_irqrestore(&thi->t_lock, flags);
1801
1802 if (wait)
1803 wait_for_completion(&thi->stop);
1804}
1805
1806#ifdef CONFIG_SMP
1807/**
1808 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1809 * @mdev: DRBD device.
1810 *
1811 * Forces all threads of a device onto the same CPU. This is beneficial for
1812 * DRBD's performance. May be overwritten by user's configuration.
1813 */
1814void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1815{
1816 int ord, cpu;
1817
1818 /* user override. */
1819 if (cpumask_weight(mdev->cpu_mask))
1820 return;
1821
1822 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1823 for_each_online_cpu(cpu) {
1824 if (ord-- == 0) {
1825 cpumask_set_cpu(cpu, mdev->cpu_mask);
1826 return;
1827 }
1828 }
1829 /* should not be reached */
1830 cpumask_setall(mdev->cpu_mask);
1831}
1832
1833/**
1834 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1835 * @mdev: DRBD device.
1836 *
1837 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1838 * prematurely.
1839 */
1840void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1841{
1842 struct task_struct *p = current;
1843 struct drbd_thread *thi =
1844 p == mdev->asender.task ? &mdev->asender :
1845 p == mdev->receiver.task ? &mdev->receiver :
1846 p == mdev->worker.task ? &mdev->worker :
1847 NULL;
1848 ERR_IF(thi == NULL)
1849 return;
1850 if (!thi->reset_cpu_mask)
1851 return;
1852 thi->reset_cpu_mask = 0;
1853 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1854}
1855#endif
1856
1857/* the appropriate socket mutex must be held already */
1858int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001859 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001860 size_t size, unsigned msg_flags)
1861{
1862 int sent, ok;
1863
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001864 ERR_IF(!h) return false;
1865 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001866
1867 h->magic = BE_DRBD_MAGIC;
1868 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001869 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001870
Philipp Reisnerb411b362009-09-25 16:07:19 -07001871 sent = drbd_send(mdev, sock, h, size, msg_flags);
1872
1873 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001874 if (!ok && !signal_pending(current))
1875 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001876 cmdname(cmd), (int)size, sent);
1877 return ok;
1878}
1879
1880/* don't pass the socket. we may only look at it
1881 * when we hold the appropriate socket mutex.
1882 */
1883int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001884 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001885{
1886 int ok = 0;
1887 struct socket *sock;
1888
1889 if (use_data_socket) {
1890 mutex_lock(&mdev->data.mutex);
1891 sock = mdev->data.socket;
1892 } else {
1893 mutex_lock(&mdev->meta.mutex);
1894 sock = mdev->meta.socket;
1895 }
1896
1897 /* drbd_disconnect() could have called drbd_free_sock()
1898 * while we were waiting in down()... */
1899 if (likely(sock != NULL))
1900 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1901
1902 if (use_data_socket)
1903 mutex_unlock(&mdev->data.mutex);
1904 else
1905 mutex_unlock(&mdev->meta.mutex);
1906 return ok;
1907}
1908
1909int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1910 size_t size)
1911{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001912 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001913 int ok;
1914
1915 h.magic = BE_DRBD_MAGIC;
1916 h.command = cpu_to_be16(cmd);
1917 h.length = cpu_to_be16(size);
1918
1919 if (!drbd_get_data_sock(mdev))
1920 return 0;
1921
Philipp Reisnerb411b362009-09-25 16:07:19 -07001922 ok = (sizeof(h) ==
1923 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1924 ok = ok && (size ==
1925 drbd_send(mdev, mdev->data.socket, data, size, 0));
1926
1927 drbd_put_data_sock(mdev);
1928
1929 return ok;
1930}
1931
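/**
 * drbd_send_sync_param() - send the current syncer configuration to the peer
 * @mdev:	DRBD device.
 * @sc:		syncer configuration to send.
 *
 * Packet size and command (P_SYNC_PARAM vs P_SYNC_PARAM89) depend on the
 * agreed protocol version; the pre-allocated send buffer of the data socket
 * is used to avoid a kmalloc here.
 */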
1932int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1933{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001934 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001935 struct socket *sock;
1936 int size, rv;
1937 const int apv = mdev->agreed_pro_version;
1938
1939 size = apv <= 87 ? sizeof(struct p_rs_param)
1940 : apv == 88 ? sizeof(struct p_rs_param)
1941 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001942 : apv <= 94 ? sizeof(struct p_rs_param_89)
1943 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001944
1945 /* used from admin command context and receiver/worker context.
1946 * to avoid kmalloc, grab the socket right here,
1947 * then use the pre-allocated sbuf there */
1948 mutex_lock(&mdev->data.mutex);
1949 sock = mdev->data.socket;
1950
1951 if (likely(sock != NULL)) {
1952 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1953
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001954 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001955
1956 /* initialize verify_alg and csums_alg */
1957 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1958
1959 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001960 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1961 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1962 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1963 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001964
1965 if (apv >= 88)
1966 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1967 if (apv >= 89)
1968 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1969
1970 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1971 } else
1972 rv = 0; /* not ok */
1973
1974 mutex_unlock(&mdev->data.mutex);
1975
1976 return rv;
1977}
1978
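/**
 * drbd_send_protocol() - send P_PROTOCOL with our net configuration
 * @mdev:	DRBD device.
 *
 * Returns the result of drbd_send_cmd(), 0 if the packet could not be
 * allocated, or -1 if --dry-run was requested but the peer does not
 * support it.
 */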
1979int drbd_send_protocol(struct drbd_conf *mdev)
1980{
1981 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001982 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001983
1984 size = sizeof(struct p_protocol);
1985
1986 if (mdev->agreed_pro_version >= 87)
1987 size += strlen(mdev->net_conf->integrity_alg) + 1;
1988
1989 /* we must not recurse into our own queue,
1990 * as that is blocked during handshake */
1991 p = kmalloc(size, GFP_NOIO);
1992 if (p == NULL)
1993 return 0;
1994
1995 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1996 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1997 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1998 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001999 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2000
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002001 cf = 0;
2002 if (mdev->net_conf->want_lose)
2003 cf |= CF_WANT_LOSE;
2004 if (mdev->net_conf->dry_run) {
2005 if (mdev->agreed_pro_version >= 92)
2006 cf |= CF_DRY_RUN;
2007 else {
2008 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02002009 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002010 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002011 }
2012 }
2013 p->conn_flags = cpu_to_be32(cf);
2014
Philipp Reisnerb411b362009-09-25 16:07:19 -07002015 if (mdev->agreed_pro_version >= 87)
2016 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2017
2018 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002019 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002020 kfree(p);
2021 return rv;
2022}
2023
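/* Send our current UUID set (P_UUIDS) to the peer, together with the number
 * of bits set in the bitmap and the given uuid_flags.  Returns 1 without
 * sending anything if the local disk is not at least D_NEGOTIATING. */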
2024int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2025{
2026 struct p_uuids p;
2027 int i;
2028
2029 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2030 return 1;
2031
2032 for (i = UI_CURRENT; i < UI_SIZE; i++)
2033 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2034
2035 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2036 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2037 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2038 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2039 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2040 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2041
2042 put_ldev(mdev);
2043
2044 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002045 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002046}
2047
2048int drbd_send_uuids(struct drbd_conf *mdev)
2049{
2050 return _drbd_send_uuids(mdev, 0);
2051}
2052
2053int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2054{
2055 return _drbd_send_uuids(mdev, 8);
2056}
2057
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002058void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2059{
2060 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2061 u64 *uuid = mdev->ldev->md.uuid;
2062 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2063 text,
2064 (unsigned long long)uuid[UI_CURRENT],
2065 (unsigned long long)uuid[UI_BITMAP],
2066 (unsigned long long)uuid[UI_HISTORY_START],
2067 (unsigned long long)uuid[UI_HISTORY_END]);
2068 put_ldev(mdev);
2069 } else {
2070 dev_info(DEV, "%s effective data uuid: %016llX\n",
2071 text,
2072 (unsigned long long)mdev->ed_uuid);
2073 }
2074}
2075
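/* Bump the bitmap UUID, write it to the meta data, and send the new value
 * to the peer (P_SYNC_UUID).  Used when we become sync source with
 * protocol >= 96, see after_state_ch(). */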
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002076int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002077{
2078 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002079 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002080
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002081 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2082
Philipp Reisner4a23f262011-01-11 17:42:17 +01002083 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002084 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002085 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002086 drbd_md_sync(mdev);
2087 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002088
2089 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002090 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002091}
2092
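/**
 * drbd_send_sizes() - send P_SIZES with our disk geometry to the peer
 * @mdev:	DRBD device.
 * @trigger_reply: if set, report a current capacity of 0 instead of the real one.
 * @flags:	dds_flags forwarded in the packet.
 *
 * Reports backing device size, requested user size, queue order type and
 * the maximal BIO size we accept.
 */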
Philipp Reisnere89b5912010-03-24 17:11:33 +01002093int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002094{
2095 struct p_sizes p;
2096 sector_t d_size, u_size;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002097 int q_order_type, max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002098 int ok;
2099
2100 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2101 D_ASSERT(mdev->ldev->backing_bdev);
2102 d_size = drbd_get_max_capacity(mdev->ldev);
2103 u_size = mdev->ldev->dc.disk_size;
2104 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002105 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2106 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002107 put_ldev(mdev);
2108 } else {
2109 d_size = 0;
2110 u_size = 0;
2111 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002112 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002113 }
2114
2115 p.d_size = cpu_to_be64(d_size);
2116 p.u_size = cpu_to_be64(u_size);
2117 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002118 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002119 p.queue_order_type = cpu_to_be16(q_order_type);
2120 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002121
2122 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002123 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002124 return ok;
2125}
2126
2127/**
2128 * drbd_send_state() - Sends the drbd state to the peer
2129 * @mdev: DRBD device.
2130 */
2131int drbd_send_state(struct drbd_conf *mdev)
2132{
2133 struct socket *sock;
2134 struct p_state p;
2135 int ok = 0;
2136
2137	/* Grab state lock so we won't send state if we're in the middle
2138 * of a cluster wide state change on another thread */
2139 drbd_state_lock(mdev);
2140
2141 mutex_lock(&mdev->data.mutex);
2142
2143 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2144 sock = mdev->data.socket;
2145
2146 if (likely(sock != NULL)) {
2147 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002148 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002149 }
2150
2151 mutex_unlock(&mdev->data.mutex);
2152
2153 drbd_state_unlock(mdev);
2154 return ok;
2155}
2156
2157int drbd_send_state_req(struct drbd_conf *mdev,
2158 union drbd_state mask, union drbd_state val)
2159{
2160 struct p_req_state p;
2161
2162 p.mask = cpu_to_be32(mask.i);
2163 p.val = cpu_to_be32(val.i);
2164
2165 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002166 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002167}
2168
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002169int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002170{
2171 struct p_req_state_reply p;
2172
2173 p.retcode = cpu_to_be32(retcode);
2174
2175 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002176 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002177}
2178
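/* Try to RLE+VLI encode a chunk of the bitmap, starting at c->bit_offset,
 * into the code[] area of @p.  Returns the number of code bytes used,
 * 0 if compression is not allowed or did not pay off (the caller then
 * sends plain bitmap words instead), or -1 on an encoding error. */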
2179int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2180 struct p_compressed_bm *p,
2181 struct bm_xfer_ctx *c)
2182{
2183 struct bitstream bs;
2184 unsigned long plain_bits;
2185 unsigned long tmp;
2186 unsigned long rl;
2187 unsigned len;
2188 unsigned toggle;
2189 int bits;
2190
2191 /* may we use this feature? */
2192 if ((mdev->sync_conf.use_rle == 0) ||
2193 (mdev->agreed_pro_version < 90))
2194 return 0;
2195
2196 if (c->bit_offset >= c->bm_bits)
2197 return 0; /* nothing to do. */
2198
2199	/* use at most this many bytes */
2200 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2201 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2202 /* plain bits covered in this code string */
2203 plain_bits = 0;
2204
2205 /* p->encoding & 0x80 stores whether the first run length is set.
2206 * bit offset is implicit.
2207 * start with toggle == 2 to be able to tell the first iteration */
2208 toggle = 2;
2209
2210	/* see how many plain bits we can stuff into one packet
2211 * using RLE and VLI. */
2212 do {
2213 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2214 : _drbd_bm_find_next(mdev, c->bit_offset);
2215 if (tmp == -1UL)
2216 tmp = c->bm_bits;
2217 rl = tmp - c->bit_offset;
2218
2219 if (toggle == 2) { /* first iteration */
2220 if (rl == 0) {
2221 /* the first checked bit was set,
2222 * store start value, */
2223 DCBP_set_start(p, 1);
2224 /* but skip encoding of zero run length */
2225 toggle = !toggle;
2226 continue;
2227 }
2228 DCBP_set_start(p, 0);
2229 }
2230
2231 /* paranoia: catch zero runlength.
2232 * can only happen if bitmap is modified while we scan it. */
2233 if (rl == 0) {
2234 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2235 "t:%u bo:%lu\n", toggle, c->bit_offset);
2236 return -1;
2237 }
2238
2239 bits = vli_encode_bits(&bs, rl);
2240 if (bits == -ENOBUFS) /* buffer full */
2241 break;
2242 if (bits <= 0) {
2243 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2244 return 0;
2245 }
2246
2247 toggle = !toggle;
2248 plain_bits += rl;
2249 c->bit_offset = tmp;
2250 } while (c->bit_offset < c->bm_bits);
2251
2252 len = bs.cur.b - p->code + !!bs.cur.bit;
2253
2254 if (plain_bits < (len << 3)) {
2255 /* incompressible with this method.
2256 * we need to rewind both word and bit position. */
2257 c->bit_offset -= plain_bits;
2258 bm_xfer_ctx_bit_to_word_offset(c);
2259 c->bit_offset = c->word_offset * BITS_PER_LONG;
2260 return 0;
2261 }
2262
2263 /* RLE + VLI was able to compress it just fine.
2264 * update c->word_offset. */
2265 bm_xfer_ctx_bit_to_word_offset(c);
2266
2267 /* store pad_bits */
2268 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2269
2270 return len;
2271}
2272
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002273/**
2274 * send_bitmap_rle_or_plain
2275 *
2276 * Return 0 when done, 1 when another iteration is needed, and a negative error
2277 * code upon failure.
2278 */
2279static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002280send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002281 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002282{
2283 struct p_compressed_bm *p = (void*)h;
2284 unsigned long num_words;
2285 int len;
2286 int ok;
2287
2288 len = fill_bitmap_rle_bits(mdev, p, c);
2289
2290 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002291 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002292
2293 if (len) {
2294 DCBP_set_code(p, RLE_VLI_Bits);
2295 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2296 sizeof(*p) + len, 0);
2297
2298 c->packets[0]++;
2299 c->bytes[0] += sizeof(*p) + len;
2300
2301 if (c->bit_offset >= c->bm_bits)
2302 len = 0; /* DONE */
2303 } else {
2304 /* was not compressible.
2305 * send a buffer full of plain text bits instead. */
2306 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2307 len = num_words * sizeof(long);
2308 if (len)
2309 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2310 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002311 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002312 c->word_offset += num_words;
2313 c->bit_offset = c->word_offset * BITS_PER_LONG;
2314
2315 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002316 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002317
2318 if (c->bit_offset > c->bm_bits)
2319 c->bit_offset = c->bm_bits;
2320 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002321 if (ok) {
2322 if (len == 0) {
2323 INFO_bm_xfer_stats(mdev, "send", c);
2324 return 0;
2325 } else
2326 return 1;
2327 }
2328 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002329}
2330
2331/* See the comment at receive_bitmap() */
2332int _drbd_send_bitmap(struct drbd_conf *mdev)
2333{
2334 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002335 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002336 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002337
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002338 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002339
2340 /* maybe we should use some per thread scratch page,
2341 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002342 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002343 if (!p) {
2344 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002345 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002346 }
2347
2348 if (get_ldev(mdev)) {
2349 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2350 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2351 drbd_bm_set_all(mdev);
2352 if (drbd_bm_write(mdev)) {
2353 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2354 * but otherwise process as per normal - need to tell other
2355 * side that a full resync is required! */
2356 dev_err(DEV, "Failed to write bitmap to disk!\n");
2357 } else {
2358 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2359 drbd_md_sync(mdev);
2360 }
2361 }
2362 put_ldev(mdev);
2363 }
2364
2365 c = (struct bm_xfer_ctx) {
2366 .bm_bits = drbd_bm_bits(mdev),
2367 .bm_words = drbd_bm_words(mdev),
2368 };
2369
2370 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002371 err = send_bitmap_rle_or_plain(mdev, p, &c);
2372 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002373
2374 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002375 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002376}
2377
2378int drbd_send_bitmap(struct drbd_conf *mdev)
2379{
2380 int err;
2381
2382 if (!drbd_get_data_sock(mdev))
2383 return -1;
2384 err = !_drbd_send_bitmap(mdev);
2385 drbd_put_data_sock(mdev);
2386 return err;
2387}
2388
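/**
 * drbd_send_b_ack() - send P_BARRIER_ACK for a completed epoch
 * @mdev:	DRBD device.
 * @barrier_nr:	barrier number to acknowledge, passed through unconverted.
 * @set_size:	epoch size to report back, converted to big endian here.
 *
 * Returns false without sending if we are not at least C_CONNECTED.
 */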
2389int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2390{
2391 int ok;
2392 struct p_barrier_ack p;
2393
2394 p.barrier = barrier_nr;
2395 p.set_size = cpu_to_be32(set_size);
2396
2397 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002398 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002399 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002400 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002401 return ok;
2402}
2403
2404/**
2405 * _drbd_send_ack() - Sends an ack packet
2406 * @mdev: DRBD device.
2407 * @cmd: Packet command code.
2408 * @sector: sector, needs to be in big endian byte order
2409 * @blksize: size in byte, needs to be in big endian byte order
2410 * @block_id: Id, big endian byte order
2411 */
2412static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2413 u64 sector,
2414 u32 blksize,
2415 u64 block_id)
2416{
2417 int ok;
2418 struct p_block_ack p;
2419
2420 p.sector = sector;
2421 p.block_id = block_id;
2422 p.blksize = blksize;
2423 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2424
2425 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002426 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002427 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002428 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002429 return ok;
2430}
2431
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002432/* dp->sector and dp->block_id already/still in network byte order,
2433 * data_size is payload size according to dp->head,
2434 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002435int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002436 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002437{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002438 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2439 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002440 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2441 dp->block_id);
2442}
2443
2444int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2445 struct p_block_req *rp)
2446{
2447 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2448}
2449
2450/**
2451 * drbd_send_ack() - Sends an ack packet
2452 * @mdev: DRBD device.
2453 * @cmd: Packet command code.
2454 * @e: Epoch entry.
2455 */
2456int drbd_send_ack(struct drbd_conf *mdev,
2457 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2458{
2459 return _drbd_send_ack(mdev, cmd,
2460 cpu_to_be64(e->sector),
2461 cpu_to_be32(e->size),
2462 e->block_id);
2463}
2464
2465/* This function misuses the block_id field to signal if the blocks
2466	 * are in sync or not. */
2467int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2468 sector_t sector, int blksize, u64 block_id)
2469{
2470 return _drbd_send_ack(mdev, cmd,
2471 cpu_to_be64(sector),
2472 cpu_to_be32(blksize),
2473 cpu_to_be64(block_id));
2474}
2475
2476int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2477 sector_t sector, int size, u64 block_id)
2478{
2479 int ok;
2480 struct p_block_req p;
2481
2482 p.sector = cpu_to_be64(sector);
2483 p.block_id = block_id;
2484 p.blksize = cpu_to_be32(size);
2485
2486 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002487 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002488 return ok;
2489}
2490
2491int drbd_send_drequest_csum(struct drbd_conf *mdev,
2492 sector_t sector, int size,
2493 void *digest, int digest_size,
2494 enum drbd_packets cmd)
2495{
2496 int ok;
2497 struct p_block_req p;
2498
2499 p.sector = cpu_to_be64(sector);
2500 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2501 p.blksize = cpu_to_be32(size);
2502
2503 p.head.magic = BE_DRBD_MAGIC;
2504 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002505 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002506
2507 mutex_lock(&mdev->data.mutex);
2508
2509 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2510 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2511
2512 mutex_unlock(&mdev->data.mutex);
2513
2514 return ok;
2515}
2516
2517int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2518{
2519 int ok;
2520 struct p_block_req p;
2521
2522 p.sector = cpu_to_be64(sector);
2523 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2524 p.blksize = cpu_to_be32(size);
2525
2526 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002527 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002528 return ok;
2529}
2530
2531/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002532 * returns false if we should retry,
2533 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002534 */
2535static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2536{
2537 int drop_it;
2538 /* long elapsed = (long)(jiffies - mdev->last_received); */
2539
2540 drop_it = mdev->meta.socket == sock
2541 || !mdev->asender.task
2542 || get_t_state(&mdev->asender) != Running
2543 || mdev->state.conn < C_CONNECTED;
2544
2545 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002546 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002547
2548 drop_it = !--mdev->ko_count;
2549 if (!drop_it) {
2550 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2551 current->comm, current->pid, mdev->ko_count);
2552 request_ping(mdev);
2553 }
2554
2555 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2556}
2557
2558/* The idea of sendpage seems to be to put some kind of reference
2559 * to the page into the skb, and to hand it over to the NIC. In
2560 * this process get_page() gets called.
2561 *
2562 * As soon as the page was really sent over the network put_page()
2563 * gets called by some part of the network layer. [ NIC driver? ]
2564 *
2565 * [ get_page() / put_page() increment/decrement the count. If count
2566 * reaches 0 the page will be freed. ]
2567 *
2568 * This works nicely with pages from FSs.
2569 * But this means that in protocol A we might signal IO completion too early!
2570 *
2571 * In order not to corrupt data during a resync we must make sure
2572	 * that we do not reuse our own buffer pages (EEs) too early, therefore
2573 * we have the net_ee list.
2574 *
2575 * XFS seems to have problems, still, it submits pages with page_count == 0!
2576 * As a workaround, we disable sendpage on pages
2577 * with page_count == 0 or PageSlab.
2578 */
2579static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002580 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002581{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002582 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002583 kunmap(page);
2584 if (sent == size)
2585 mdev->send_cnt += size>>9;
2586 return sent == size;
2587}
2588
2589static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002590 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002591{
2592 mm_segment_t oldfs = get_fs();
2593 int sent, ok;
2594 int len = size;
2595
2596 /* e.g. XFS meta- & log-data is in slab pages, which have a
2597 * page_count of 0 and/or have PageSlab() set.
2598 * we cannot use send_page for those, as that does get_page();
2599 * put_page(); and would cause either a VM_BUG directly, or
2600 * __page_cache_release a page that would actually still be referenced
2601 * by someone, leading to some obscure delayed Oops somewhere else. */
2602 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002603 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002604
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002605 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002606 drbd_update_congested(mdev);
2607 set_fs(KERNEL_DS);
2608 do {
2609 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2610 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002611 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002612 if (sent == -EAGAIN) {
2613 if (we_should_drop_the_connection(mdev,
2614 mdev->data.socket))
2615 break;
2616 else
2617 continue;
2618 }
2619 if (sent <= 0) {
2620 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2621 __func__, (int)size, len, sent);
2622 break;
2623 }
2624 len -= sent;
2625 offset += sent;
2626 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2627 set_fs(oldfs);
2628 clear_bit(NET_CONGESTED, &mdev->flags);
2629
2630 ok = (len == 0);
2631 if (likely(ok))
2632 mdev->send_cnt += size>>9;
2633 return ok;
2634}
2635
2636static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2637{
2638 struct bio_vec *bvec;
2639 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002640 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002641 __bio_for_each_segment(bvec, bio, i, 0) {
2642 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002643 bvec->bv_offset, bvec->bv_len,
2644 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002645 return 0;
2646 }
2647 return 1;
2648}
2649
2650static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2651{
2652 struct bio_vec *bvec;
2653 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002654 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002655 __bio_for_each_segment(bvec, bio, i, 0) {
2656 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002657 bvec->bv_offset, bvec->bv_len,
2658 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002659 return 0;
2660 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002661 return 1;
2662}
2663
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002664static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2665{
2666 struct page *page = e->pages;
2667 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002668 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002669 page_chain_for_each(page) {
2670 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002671 if (!_drbd_send_page(mdev, page, 0, l,
2672 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002673 return 0;
2674 len -= l;
2675 }
2676 return 1;
2677}
2678
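/* Translate the REQ_* flags of a bio into the DP_* flags we put on the wire.
 * Peers with an agreed protocol version below 95 only get DP_RW_SYNC. */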
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002679static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2680{
2681 if (mdev->agreed_pro_version >= 95)
2682 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002683 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2684 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2685 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2686 else
Jens Axboe721a9602011-03-09 11:56:30 +01002687 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002688}
2689
Philipp Reisnerb411b362009-09-25 16:07:19 -07002690/* Used to send write requests
2691 * R_PRIMARY -> Peer (P_DATA)
2692 */
2693int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2694{
2695 int ok = 1;
2696 struct p_data p;
2697 unsigned int dp_flags = 0;
2698 void *dgb;
2699 int dgs;
2700
2701 if (!drbd_get_data_sock(mdev))
2702 return 0;
2703
2704 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2705 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2706
Philipp Reisnerd5373382010-08-23 15:18:33 +02002707 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002708 p.head.h80.magic = BE_DRBD_MAGIC;
2709 p.head.h80.command = cpu_to_be16(P_DATA);
2710 p.head.h80.length =
2711 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2712 } else {
2713 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2714 p.head.h95.command = cpu_to_be16(P_DATA);
2715 p.head.h95.length =
2716 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2717 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002718
2719 p.sector = cpu_to_be64(req->sector);
2720 p.block_id = (unsigned long)req;
2721 p.seq_num = cpu_to_be32(req->seq_num =
2722 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002723
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002724 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2725
Philipp Reisnerb411b362009-09-25 16:07:19 -07002726 if (mdev->state.conn >= C_SYNC_SOURCE &&
2727 mdev->state.conn <= C_PAUSED_SYNC_T)
2728 dp_flags |= DP_MAY_SET_IN_SYNC;
2729
2730 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002731 set_bit(UNPLUG_REMOTE, &mdev->flags);
2732 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002733 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002734 if (ok && dgs) {
2735 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002736 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002737 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002738 }
2739 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002740 /* For protocol A, we have to memcpy the payload into
2741 * socket buffers, as we may complete right away
2742 * as soon as we handed it over to tcp, at which point the data
2743 * pages may become invalid.
2744 *
2745 * For data-integrity enabled, we copy it as well, so we can be
2746 * sure that even if the bio pages may still be modified, it
2747 * won't change the data on the wire, thus if the digest checks
2748 * out ok after sending on this side, but does not fit on the
2749 * receiving side, we sure have detected corruption elsewhere.
2750 */
2751 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002752 ok = _drbd_send_bio(mdev, req->master_bio);
2753 else
2754 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002755
2756 /* double check digest, sometimes buffers have been modified in flight. */
2757 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002758 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002759 * currently supported in kernel crypto. */
2760 unsigned char digest[64];
2761 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2762 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2763 dev_warn(DEV,
2764 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2765 (unsigned long long)req->sector, req->size);
2766 }
2767 } /* else if (dgs > 64) {
2768 ... Be noisy about digest too large ...
2769 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002770 }
2771
2772 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002773
Philipp Reisnerb411b362009-09-25 16:07:19 -07002774 return ok;
2775}
2776
2777/* answer packet, used to send data back for read requests:
2778 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2779 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2780 */
2781int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2782 struct drbd_epoch_entry *e)
2783{
2784 int ok;
2785 struct p_data p;
2786 void *dgb;
2787 int dgs;
2788
2789 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2790 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2791
Philipp Reisnerd5373382010-08-23 15:18:33 +02002792 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002793 p.head.h80.magic = BE_DRBD_MAGIC;
2794 p.head.h80.command = cpu_to_be16(cmd);
2795 p.head.h80.length =
2796 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2797 } else {
2798 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2799 p.head.h95.command = cpu_to_be16(cmd);
2800 p.head.h95.length =
2801 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2802 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002803
2804 p.sector = cpu_to_be64(e->sector);
2805 p.block_id = e->block_id;
2806 /* p.seq_num = 0; No sequence numbers here.. */
2807
2808 /* Only called by our kernel thread.
2809 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2810 * in response to admin command or module unload.
2811 */
2812 if (!drbd_get_data_sock(mdev))
2813 return 0;
2814
Philipp Reisner0b70a132010-08-20 13:36:10 +02002815 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002816 if (ok && dgs) {
2817 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002818 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002819 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002820 }
2821 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002822 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002823
2824 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002825
Philipp Reisnerb411b362009-09-25 16:07:19 -07002826 return ok;
2827}
2828
Philipp Reisner73a01a12010-10-27 14:33:00 +02002829int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2830{
2831 struct p_block_desc p;
2832
2833 p.sector = cpu_to_be64(req->sector);
2834 p.blksize = cpu_to_be32(req->size);
2835
2836 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2837}
2838
Philipp Reisnerb411b362009-09-25 16:07:19 -07002839/*
2840 drbd_send distinguishes two cases:
2841
2842 Packets sent via the data socket "sock"
2843 and packets sent via the meta data socket "msock"
2844
2845 sock msock
2846 -----------------+-------------------------+------------------------------
2847 timeout conf.timeout / 2 conf.timeout / 2
2848 timeout action send a ping via msock Abort communication
2849 and close all sockets
2850*/
2851
2852/*
2853 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2854 */
2855int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2856 void *buf, size_t size, unsigned msg_flags)
2857{
2858 struct kvec iov;
2859 struct msghdr msg;
2860 int rv, sent = 0;
2861
2862 if (!sock)
2863 return -1000;
2864
2865 /* THINK if (signal_pending) return ... ? */
2866
2867 iov.iov_base = buf;
2868 iov.iov_len = size;
2869
2870 msg.msg_name = NULL;
2871 msg.msg_namelen = 0;
2872 msg.msg_control = NULL;
2873 msg.msg_controllen = 0;
2874 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2875
2876 if (sock == mdev->data.socket) {
2877 mdev->ko_count = mdev->net_conf->ko_count;
2878 drbd_update_congested(mdev);
2879 }
2880 do {
2881 /* STRANGE
2882 * tcp_sendmsg does _not_ use its size parameter at all ?
2883 *
2884 * -EAGAIN on timeout, -EINTR on signal.
2885 */
2886/* THINK
2887 * do we need to block DRBD_SIG if sock == &meta.socket ??
2888 * otherwise wake_asender() might interrupt some send_*Ack !
2889 */
2890 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2891 if (rv == -EAGAIN) {
2892 if (we_should_drop_the_connection(mdev, sock))
2893 break;
2894 else
2895 continue;
2896 }
2897 D_ASSERT(rv != 0);
2898 if (rv == -EINTR) {
2899 flush_signals(current);
2900 rv = 0;
2901 }
2902 if (rv < 0)
2903 break;
2904 sent += rv;
2905 iov.iov_base += rv;
2906 iov.iov_len -= rv;
2907 } while (sent < size);
2908
2909 if (sock == mdev->data.socket)
2910 clear_bit(NET_CONGESTED, &mdev->flags);
2911
2912 if (rv <= 0) {
2913 if (rv != -EAGAIN) {
2914 dev_err(DEV, "%s_sendmsg returned %d\n",
2915 sock == mdev->meta.socket ? "msock" : "sock",
2916 rv);
2917 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2918 } else
2919 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2920 }
2921
2922 return sent;
2923}
2924
2925static int drbd_open(struct block_device *bdev, fmode_t mode)
2926{
2927 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2928 unsigned long flags;
2929 int rv = 0;
2930
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002931 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002932 spin_lock_irqsave(&mdev->req_lock, flags);
2933 /* to have a stable mdev->state.role
2934 * and no race with updating open_cnt */
2935
2936 if (mdev->state.role != R_PRIMARY) {
2937 if (mode & FMODE_WRITE)
2938 rv = -EROFS;
2939 else if (!allow_oos)
2940 rv = -EMEDIUMTYPE;
2941 }
2942
2943 if (!rv)
2944 mdev->open_cnt++;
2945 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002946 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002947
2948 return rv;
2949}
2950
2951static int drbd_release(struct gendisk *gd, fmode_t mode)
2952{
2953 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002954 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002955 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002956 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002957 return 0;
2958}
2959
Philipp Reisnerb411b362009-09-25 16:07:19 -07002960static void drbd_set_defaults(struct drbd_conf *mdev)
2961{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002962 /* This way we get a compile error when sync_conf grows,
 2963 and we forget to initialize it here */
2964 mdev->sync_conf = (struct syncer_conf) {
2965 /* .rate = */ DRBD_RATE_DEF,
2966 /* .after = */ DRBD_AFTER_DEF,
2967 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002968 /* .verify_alg = */ {}, 0,
2969 /* .cpu_mask = */ {}, 0,
2970 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002971 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002972 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2973 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2974 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2975 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002976 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2977 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002978 };
2979
 2980 /* Have to do it this way, because the layout differs between
2981 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002982 mdev->state = (union drbd_state) {
2983 { .role = R_SECONDARY,
2984 .peer = R_UNKNOWN,
2985 .conn = C_STANDALONE,
2986 .disk = D_DISKLESS,
2987 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002988 .susp = 0,
2989 .susp_nod = 0,
2990 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002991 } };
2992}
2993
2994void drbd_init_set_defaults(struct drbd_conf *mdev)
2995{
2996 /* the memset(,0,) did most of this.
2997 * note: only assignments, no allocation in here */
2998
2999 drbd_set_defaults(mdev);
3000
Philipp Reisnerb411b362009-09-25 16:07:19 -07003001 atomic_set(&mdev->ap_bio_cnt, 0);
3002 atomic_set(&mdev->ap_pending_cnt, 0);
3003 atomic_set(&mdev->rs_pending_cnt, 0);
3004 atomic_set(&mdev->unacked_cnt, 0);
3005 atomic_set(&mdev->local_cnt, 0);
3006 atomic_set(&mdev->net_cnt, 0);
3007 atomic_set(&mdev->packet_seq, 0);
3008 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003009 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02003010 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003011 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02003012 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003013
3014 mutex_init(&mdev->md_io_mutex);
3015 mutex_init(&mdev->data.mutex);
3016 mutex_init(&mdev->meta.mutex);
3017 sema_init(&mdev->data.work.s, 0);
3018 sema_init(&mdev->meta.work.s, 0);
3019 mutex_init(&mdev->state_mutex);
3020
3021 spin_lock_init(&mdev->data.work.q_lock);
3022 spin_lock_init(&mdev->meta.work.q_lock);
3023
3024 spin_lock_init(&mdev->al_lock);
3025 spin_lock_init(&mdev->req_lock);
3026 spin_lock_init(&mdev->peer_seq_lock);
3027 spin_lock_init(&mdev->epoch_lock);
3028
3029 INIT_LIST_HEAD(&mdev->active_ee);
3030 INIT_LIST_HEAD(&mdev->sync_ee);
3031 INIT_LIST_HEAD(&mdev->done_ee);
3032 INIT_LIST_HEAD(&mdev->read_ee);
3033 INIT_LIST_HEAD(&mdev->net_ee);
3034 INIT_LIST_HEAD(&mdev->resync_reads);
3035 INIT_LIST_HEAD(&mdev->data.work.q);
3036 INIT_LIST_HEAD(&mdev->meta.work.q);
3037 INIT_LIST_HEAD(&mdev->resync_work.list);
3038 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003039 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003040 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003041 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003042 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003043
Philipp Reisner794abb72010-12-27 11:51:23 +01003044 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003045 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003046 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003047 mdev->md_sync_work.cb = w_md_sync;
3048 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003049 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003050 init_timer(&mdev->resync_timer);
3051 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003052 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003053 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003054 mdev->resync_timer.function = resync_timer_fn;
3055 mdev->resync_timer.data = (unsigned long) mdev;
3056 mdev->md_sync_timer.function = md_sync_timer_fn;
3057 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003058 mdev->start_resync_timer.function = start_resync_timer_fn;
3059 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003060 mdev->request_timer.function = request_timer_fn;
3061 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003062
3063 init_waitqueue_head(&mdev->misc_wait);
3064 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003065 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003066 init_waitqueue_head(&mdev->ee_wait);
3067 init_waitqueue_head(&mdev->al_wait);
3068 init_waitqueue_head(&mdev->seq_wait);
3069
3070 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3071 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3072 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3073
3074 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003075 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003076 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003077 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3078 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003079}
3080
3081void drbd_mdev_cleanup(struct drbd_conf *mdev)
3082{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003083 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003084 if (mdev->receiver.t_state != None)
3085 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3086 mdev->receiver.t_state);
3087
3088 /* no need to lock it, I'm the only thread alive */
3089 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3090 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3091 mdev->al_writ_cnt =
3092 mdev->bm_writ_cnt =
3093 mdev->read_cnt =
3094 mdev->recv_cnt =
3095 mdev->send_cnt =
3096 mdev->writ_cnt =
3097 mdev->p_size =
3098 mdev->rs_start =
3099 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003100 mdev->rs_failed = 0;
3101 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003102 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003103 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3104 mdev->rs_mark_left[i] = 0;
3105 mdev->rs_mark_time[i] = 0;
3106 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003107 D_ASSERT(mdev->net_conf == NULL);
3108
3109 drbd_set_my_capacity(mdev, 0);
3110 if (mdev->bitmap) {
3111 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003112 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003113 drbd_bm_cleanup(mdev);
3114 }
3115
3116 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003117 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003118
3119 /*
 3120 * currently we call drbd_init_ee only on module load, so
 3121 * we may call drbd_release_ee only on module unload!
3122 */
3123 D_ASSERT(list_empty(&mdev->active_ee));
3124 D_ASSERT(list_empty(&mdev->sync_ee));
3125 D_ASSERT(list_empty(&mdev->done_ee));
3126 D_ASSERT(list_empty(&mdev->read_ee));
3127 D_ASSERT(list_empty(&mdev->net_ee));
3128 D_ASSERT(list_empty(&mdev->resync_reads));
3129 D_ASSERT(list_empty(&mdev->data.work.q));
3130 D_ASSERT(list_empty(&mdev->meta.work.q));
3131 D_ASSERT(list_empty(&mdev->resync_work.list));
3132 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003133 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003134
3135 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003136}
3137
3138
3139static void drbd_destroy_mempools(void)
3140{
3141 struct page *page;
3142
3143 while (drbd_pp_pool) {
3144 page = drbd_pp_pool;
3145 drbd_pp_pool = (struct page *)page_private(page);
3146 __free_page(page);
3147 drbd_pp_vacant--;
3148 }
3149
3150 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3151
3152 if (drbd_ee_mempool)
3153 mempool_destroy(drbd_ee_mempool);
3154 if (drbd_request_mempool)
3155 mempool_destroy(drbd_request_mempool);
3156 if (drbd_ee_cache)
3157 kmem_cache_destroy(drbd_ee_cache);
3158 if (drbd_request_cache)
3159 kmem_cache_destroy(drbd_request_cache);
3160 if (drbd_bm_ext_cache)
3161 kmem_cache_destroy(drbd_bm_ext_cache);
3162 if (drbd_al_ext_cache)
3163 kmem_cache_destroy(drbd_al_ext_cache);
3164
3165 drbd_ee_mempool = NULL;
3166 drbd_request_mempool = NULL;
3167 drbd_ee_cache = NULL;
3168 drbd_request_cache = NULL;
3169 drbd_bm_ext_cache = NULL;
3170 drbd_al_ext_cache = NULL;
3171
3172 return;
3173}
3174
3175static int drbd_create_mempools(void)
3176{
3177 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003178 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003179 int i;
3180
3181 /* prepare our caches and mempools */
3182 drbd_request_mempool = NULL;
3183 drbd_ee_cache = NULL;
3184 drbd_request_cache = NULL;
3185 drbd_bm_ext_cache = NULL;
3186 drbd_al_ext_cache = NULL;
3187 drbd_pp_pool = NULL;
3188
3189 /* caches */
3190 drbd_request_cache = kmem_cache_create(
3191 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3192 if (drbd_request_cache == NULL)
3193 goto Enomem;
3194
3195 drbd_ee_cache = kmem_cache_create(
3196 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3197 if (drbd_ee_cache == NULL)
3198 goto Enomem;
3199
3200 drbd_bm_ext_cache = kmem_cache_create(
3201 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3202 if (drbd_bm_ext_cache == NULL)
3203 goto Enomem;
3204
3205 drbd_al_ext_cache = kmem_cache_create(
3206 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3207 if (drbd_al_ext_cache == NULL)
3208 goto Enomem;
3209
3210 /* mempools */
3211 drbd_request_mempool = mempool_create(number,
3212 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3213 if (drbd_request_mempool == NULL)
3214 goto Enomem;
3215
3216 drbd_ee_mempool = mempool_create(number,
3217 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003218 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003219 goto Enomem;
3220
3221 /* drbd's page pool */
3222 spin_lock_init(&drbd_pp_lock);
3223
3224 for (i = 0; i < number; i++) {
3225 page = alloc_page(GFP_HIGHUSER);
3226 if (!page)
3227 goto Enomem;
3228 set_page_private(page, (unsigned long)drbd_pp_pool);
3229 drbd_pp_pool = page;
3230 }
3231 drbd_pp_vacant = number;
3232
3233 return 0;
3234
3235Enomem:
3236 drbd_destroy_mempools(); /* in case we allocated some */
3237 return -ENOMEM;
3238}
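/*
 * Sizing note, not part of the original source: "number" above reserves
 * enough pages for one request of DRBD_MAX_BIO_SIZE per configured
 * minor.  Assuming a 4 KiB PAGE_SIZE that is
 *
 *	number = (DRBD_MAX_BIO_SIZE / 4096) * minor_count
 *
 * pages in drbd_pp_pool; the request and epoch-entry mempools use the
 * same "number" as their minimum, so allocations can still make
 * progress under memory pressure.
 */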
3239
3240static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3241 void *unused)
3242{
3243 /* just so we have it. you never know what interesting things we
3244 * might want to do here some day...
3245 */
3246
3247 return NOTIFY_DONE;
3248}
3249
3250static struct notifier_block drbd_notifier = {
3251 .notifier_call = drbd_notify_sys,
3252};
3253
3254static void drbd_release_ee_lists(struct drbd_conf *mdev)
3255{
3256 int rr;
3257
3258 rr = drbd_release_ee(mdev, &mdev->active_ee);
3259 if (rr)
3260 dev_err(DEV, "%d EEs in active list found!\n", rr);
3261
3262 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3263 if (rr)
3264 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3265
3266 rr = drbd_release_ee(mdev, &mdev->read_ee);
3267 if (rr)
3268 dev_err(DEV, "%d EEs in read list found!\n", rr);
3269
3270 rr = drbd_release_ee(mdev, &mdev->done_ee);
3271 if (rr)
3272 dev_err(DEV, "%d EEs in done list found!\n", rr);
3273
3274 rr = drbd_release_ee(mdev, &mdev->net_ee);
3275 if (rr)
3276 dev_err(DEV, "%d EEs in net list found!\n", rr);
3277}
3278
3279/* caution. no locking.
3280 * currently only used from module cleanup code. */
3281static void drbd_delete_device(unsigned int minor)
3282{
3283 struct drbd_conf *mdev = minor_to_mdev(minor);
3284
3285 if (!mdev)
3286 return;
3287
3288 /* paranoia asserts */
3289 if (mdev->open_cnt != 0)
3290 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3291 __FILE__ , __LINE__);
3292
3293 ERR_IF (!list_empty(&mdev->data.work.q)) {
3294 struct list_head *lp;
3295 list_for_each(lp, &mdev->data.work.q) {
3296 dev_err(DEV, "lp = %p\n", lp);
3297 }
3298 };
3299 /* end paranoia asserts */
3300
3301 del_gendisk(mdev->vdisk);
3302
3303 /* cleanup stuff that may have been allocated during
3304 * device (re-)configuration or state changes */
3305
3306 if (mdev->this_bdev)
3307 bdput(mdev->this_bdev);
3308
3309 drbd_free_resources(mdev);
3310
3311 drbd_release_ee_lists(mdev);
3312
Bart Van Assche24c48302011-05-21 18:32:29 +02003313 /* should be freed on disconnect? */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003314 kfree(mdev->ee_hash);
3315 /*
3316 mdev->ee_hash_s = 0;
3317 mdev->ee_hash = NULL;
3318 */
3319
3320 lc_destroy(mdev->act_log);
3321 lc_destroy(mdev->resync);
3322
3323 kfree(mdev->p_uuid);
3324 /* mdev->p_uuid = NULL; */
3325
3326 kfree(mdev->int_dig_out);
3327 kfree(mdev->int_dig_in);
3328 kfree(mdev->int_dig_vv);
3329
3330 /* cleanup the rest that has been
3331 * allocated from drbd_new_device
3332 * and actually free the mdev itself */
3333 drbd_free_mdev(mdev);
3334}
3335
3336static void drbd_cleanup(void)
3337{
3338 unsigned int i;
3339
3340 unregister_reboot_notifier(&drbd_notifier);
3341
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003342 /* first remove proc,
 3343 * drbdsetup uses its presence to detect
 3344 * whether DRBD is loaded.
 3345 * If we got stuck in proc removal,
3346 * but have netlink already deregistered,
3347 * some drbdsetup commands may wait forever
3348 * for an answer.
3349 */
3350 if (drbd_proc)
3351 remove_proc_entry("drbd", NULL);
3352
Philipp Reisnerb411b362009-09-25 16:07:19 -07003353 drbd_nl_cleanup();
3354
3355 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003356 i = minor_count;
3357 while (i--)
3358 drbd_delete_device(i);
3359 drbd_destroy_mempools();
3360 }
3361
3362 kfree(minor_table);
3363
3364 unregister_blkdev(DRBD_MAJOR, "drbd");
3365
3366 printk(KERN_INFO "drbd: module cleanup done.\n");
3367}
3368
3369/**
3370 * drbd_congested() - Callback for pdflush
3371 * @congested_data: User data
3372 * @bdi_bits: Bits pdflush is currently interested in
3373 *
3374 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3375 */
3376static int drbd_congested(void *congested_data, int bdi_bits)
3377{
3378 struct drbd_conf *mdev = congested_data;
3379 struct request_queue *q;
3380 char reason = '-';
3381 int r = 0;
3382
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003383 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003384 /* DRBD has frozen IO */
3385 r = bdi_bits;
3386 reason = 'd';
3387 goto out;
3388 }
3389
3390 if (get_ldev(mdev)) {
3391 q = bdev_get_queue(mdev->ldev->backing_bdev);
3392 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3393 put_ldev(mdev);
3394 if (r)
3395 reason = 'b';
3396 }
3397
3398 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3399 r |= (1 << BDI_async_congested);
3400 reason = reason == 'b' ? 'a' : 'n';
3401 }
3402
3403out:
3404 mdev->congestion_reason = reason;
3405 return r;
3406}
3407
3408struct drbd_conf *drbd_new_device(unsigned int minor)
3409{
3410 struct drbd_conf *mdev;
3411 struct gendisk *disk;
3412 struct request_queue *q;
3413
3414 /* GFP_KERNEL, we are outside of all write-out paths */
3415 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3416 if (!mdev)
3417 return NULL;
3418 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3419 goto out_no_cpumask;
3420
3421 mdev->minor = minor;
3422
3423 drbd_init_set_defaults(mdev);
3424
3425 q = blk_alloc_queue(GFP_KERNEL);
3426 if (!q)
3427 goto out_no_q;
3428 mdev->rq_queue = q;
3429 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003430
3431 disk = alloc_disk(1);
3432 if (!disk)
3433 goto out_no_disk;
3434 mdev->vdisk = disk;
3435
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003436 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003437
3438 disk->queue = q;
3439 disk->major = DRBD_MAJOR;
3440 disk->first_minor = minor;
3441 disk->fops = &drbd_ops;
3442 sprintf(disk->disk_name, "drbd%d", minor);
3443 disk->private_data = mdev;
3444
3445 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3446 /* we have no partitions. we contain only ourselves. */
3447 mdev->this_bdev->bd_contains = mdev->this_bdev;
3448
3449 q->backing_dev_info.congested_fn = drbd_congested;
3450 q->backing_dev_info.congested_data = mdev;
3451
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003452 blk_queue_make_request(q, drbd_make_request);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003453 /* Setting max_hw_sectors to an intentionally odd value of 8 KiB here.
 3454 This triggers a max_bio_size message upon first attach or connect */
3455 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003456 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3457 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003458 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003459
3460 mdev->md_io_page = alloc_page(GFP_KERNEL);
3461 if (!mdev->md_io_page)
3462 goto out_no_io_page;
3463
3464 if (drbd_bm_init(mdev))
3465 goto out_no_bitmap;
3466 /* no need to lock access, we are still initializing this minor device. */
3467 if (!tl_init(mdev))
3468 goto out_no_tl;
3469
3470 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3471 if (!mdev->app_reads_hash)
3472 goto out_no_app_reads;
3473
3474 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3475 if (!mdev->current_epoch)
3476 goto out_no_epoch;
3477
3478 INIT_LIST_HEAD(&mdev->current_epoch->list);
3479 mdev->epochs = 1;
3480
3481 return mdev;
3482
3483/* out_whatever_else:
3484 kfree(mdev->current_epoch); */
3485out_no_epoch:
3486 kfree(mdev->app_reads_hash);
3487out_no_app_reads:
3488 tl_cleanup(mdev);
3489out_no_tl:
3490 drbd_bm_cleanup(mdev);
3491out_no_bitmap:
3492 __free_page(mdev->md_io_page);
3493out_no_io_page:
3494 put_disk(disk);
3495out_no_disk:
3496 blk_cleanup_queue(q);
3497out_no_q:
3498 free_cpumask_var(mdev->cpu_mask);
3499out_no_cpumask:
3500 kfree(mdev);
3501 return NULL;
3502}
3503
3504/* counterpart of drbd_new_device.
3505 * last part of drbd_delete_device. */
3506void drbd_free_mdev(struct drbd_conf *mdev)
3507{
3508 kfree(mdev->current_epoch);
3509 kfree(mdev->app_reads_hash);
3510 tl_cleanup(mdev);
3511 if (mdev->bitmap) /* should no longer be there. */
3512 drbd_bm_cleanup(mdev);
3513 __free_page(mdev->md_io_page);
3514 put_disk(mdev->vdisk);
3515 blk_cleanup_queue(mdev->rq_queue);
3516 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003517 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003518 kfree(mdev);
3519}
3520
3521
3522int __init drbd_init(void)
3523{
3524 int err;
3525
3526 if (sizeof(struct p_handshake) != 80) {
3527 printk(KERN_ERR
3528 "drbd: never change the size or layout "
3529 "of the HandShake packet.\n");
3530 return -EINVAL;
3531 }
3532
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003533 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003534 printk(KERN_ERR
3535 "drbd: invalid minor_count (%d)\n", minor_count);
3536#ifdef MODULE
3537 return -EINVAL;
3538#else
3539 minor_count = 8;
3540#endif
3541 }
3542
3543 err = drbd_nl_init();
3544 if (err)
3545 return err;
3546
3547 err = register_blkdev(DRBD_MAJOR, "drbd");
3548 if (err) {
3549 printk(KERN_ERR
3550 "drbd: unable to register block device major %d\n",
3551 DRBD_MAJOR);
3552 return err;
3553 }
3554
3555 register_reboot_notifier(&drbd_notifier);
3556
3557 /*
3558 * allocate all necessary structs
3559 */
3560 err = -ENOMEM;
3561
3562 init_waitqueue_head(&drbd_pp_wait);
3563
3564 drbd_proc = NULL; /* play safe for drbd_cleanup */
3565 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3566 GFP_KERNEL);
3567 if (!minor_table)
3568 goto Enomem;
3569
3570 err = drbd_create_mempools();
3571 if (err)
3572 goto Enomem;
3573
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003574 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003575 if (!drbd_proc) {
3576 printk(KERN_ERR "drbd: unable to register proc file\n");
3577 goto Enomem;
3578 }
3579
3580 rwlock_init(&global_state_lock);
3581
3582 printk(KERN_INFO "drbd: initialized. "
3583 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3584 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3585 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3586 printk(KERN_INFO "drbd: registered as block device major %d\n",
3587 DRBD_MAJOR);
3588 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3589
3590 return 0; /* Success! */
3591
3592Enomem:
3593 drbd_cleanup();
3594 if (err == -ENOMEM)
3595 /* currently always the case */
3596 printk(KERN_ERR "drbd: ran out of memory\n");
3597 else
3598 printk(KERN_ERR "drbd: initialization failure\n");
3599 return err;
3600}
3601
3602void drbd_free_bc(struct drbd_backing_dev *ldev)
3603{
3604 if (ldev == NULL)
3605 return;
3606
Tejun Heoe525fd82010-11-13 11:55:17 +01003607 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3608 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003609
3610 kfree(ldev);
3611}
3612
3613void drbd_free_sock(struct drbd_conf *mdev)
3614{
3615 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003616 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003617 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3618 sock_release(mdev->data.socket);
3619 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003620 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003621 }
3622 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003623 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003624 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3625 sock_release(mdev->meta.socket);
3626 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003627 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003628 }
3629}
3630
3631
3632void drbd_free_resources(struct drbd_conf *mdev)
3633{
3634 crypto_free_hash(mdev->csums_tfm);
3635 mdev->csums_tfm = NULL;
3636 crypto_free_hash(mdev->verify_tfm);
3637 mdev->verify_tfm = NULL;
3638 crypto_free_hash(mdev->cram_hmac_tfm);
3639 mdev->cram_hmac_tfm = NULL;
3640 crypto_free_hash(mdev->integrity_w_tfm);
3641 mdev->integrity_w_tfm = NULL;
3642 crypto_free_hash(mdev->integrity_r_tfm);
3643 mdev->integrity_r_tfm = NULL;
3644
3645 drbd_free_sock(mdev);
3646
3647 __no_warn(local,
3648 drbd_free_bc(mdev->ldev);
3649 mdev->ldev = NULL;);
3650}
3651
3652/* meta data management */
3653
3654struct meta_data_on_disk {
3655 u64 la_size; /* last agreed size. */
3656 u64 uuid[UI_SIZE]; /* UUIDs. */
3657 u64 device_uuid;
3658 u64 reserved_u64_1;
3659 u32 flags; /* MDF */
3660 u32 magic;
3661 u32 md_size_sect;
3662 u32 al_offset; /* offset to this block */
3663 u32 al_nr_extents; /* important for restoring the AL */
3664 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3665 u32 bm_offset; /* offset to the bitmap, from here */
3666 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003667 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3668 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003669
3670} __packed;
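/*
 * Layout note, not from a separate spec but from the code below: every
 * field is stored big-endian on disk and converted in
 * drbd_md_sync()/drbd_md_read(), e.g.
 *
 *	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);	write side
 *	bdev->md.flags = be32_to_cpu(buffer->flags);		read side
 *
 * drbd_md_sync() zeroes a full 512 byte buffer before filling it in and
 * writes exactly one sector, so any tail padding reads back as zero.
 */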
3671
3672/**
3673 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3674 * @mdev: DRBD device.
3675 */
3676void drbd_md_sync(struct drbd_conf *mdev)
3677{
3678 struct meta_data_on_disk *buffer;
3679 sector_t sector;
3680 int i;
3681
Lars Ellenbergee15b032010-09-03 10:00:09 +02003682 del_timer(&mdev->md_sync_timer);
3683 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003684 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3685 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003686
3687 /* We use here D_FAILED and not D_ATTACHING because we try to write
3688 * metadata even if we detach due to a disk failure! */
3689 if (!get_ldev_if_state(mdev, D_FAILED))
3690 return;
3691
Philipp Reisnerb411b362009-09-25 16:07:19 -07003692 mutex_lock(&mdev->md_io_mutex);
3693 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3694 memset(buffer, 0, 512);
3695
3696 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3697 for (i = UI_CURRENT; i < UI_SIZE; i++)
3698 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3699 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3700 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3701
3702 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3703 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3704 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3705 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3706 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3707
3708 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003709 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003710
3711 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3712 sector = mdev->ldev->md.md_offset;
3713
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003714 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003715 /* this was a try anyways ... */
3716 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003717 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003718 }
3719
3720 /* Update mdev->ldev->md.la_size_sect,
3721 * since we updated it on metadata. */
3722 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3723
3724 mutex_unlock(&mdev->md_io_mutex);
3725 put_ldev(mdev);
3726}
3727
3728/**
3729 * drbd_md_read() - Reads in the meta data super block
3730 * @mdev: DRBD device.
3731 * @bdev: Device from which the meta data should be read in.
3732 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003733 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003734 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3735 */
3736int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3737{
3738 struct meta_data_on_disk *buffer;
3739 int i, rv = NO_ERROR;
3740
3741 if (!get_ldev_if_state(mdev, D_ATTACHING))
3742 return ERR_IO_MD_DISK;
3743
Philipp Reisnerb411b362009-09-25 16:07:19 -07003744 mutex_lock(&mdev->md_io_mutex);
3745 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3746
3747 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003748 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003749 called BEFORE disk is attached */
3750 dev_err(DEV, "Error while reading metadata.\n");
3751 rv = ERR_IO_MD_DISK;
3752 goto err;
3753 }
3754
3755 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3756 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3757 rv = ERR_MD_INVALID;
3758 goto err;
3759 }
3760 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3761 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3762 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3763 rv = ERR_MD_INVALID;
3764 goto err;
3765 }
3766 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3767 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3768 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3769 rv = ERR_MD_INVALID;
3770 goto err;
3771 }
3772 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3773 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3774 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3775 rv = ERR_MD_INVALID;
3776 goto err;
3777 }
3778
3779 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3780 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3781 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3782 rv = ERR_MD_INVALID;
3783 goto err;
3784 }
3785
3786 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3787 for (i = UI_CURRENT; i < UI_SIZE; i++)
3788 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3789 bdev->md.flags = be32_to_cpu(buffer->flags);
3790 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3791 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3792
Philipp Reisner99432fc2011-05-20 16:39:13 +02003793 spin_lock_irq(&mdev->req_lock);
3794 if (mdev->state.conn < C_CONNECTED) {
3795 int peer;
3796 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3797 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3798 mdev->peer_max_bio_size = peer;
3799 }
3800 spin_unlock_irq(&mdev->req_lock);
3801
Philipp Reisnerb411b362009-09-25 16:07:19 -07003802 if (mdev->sync_conf.al_extents < 7)
3803 mdev->sync_conf.al_extents = 127;
3804
3805 err:
3806 mutex_unlock(&mdev->md_io_mutex);
3807 put_ldev(mdev);
3808
3809 return rv;
3810}
3811
3812/**
3813 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3814 * @mdev: DRBD device.
3815 *
3816 * Call this function if you change anything that should be written to
3817 * the meta-data super block. This function sets MD_DIRTY, and starts a
 3818 * timer that ensures drbd_md_sync() gets called within five seconds.
3819 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003820#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003821void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3822{
3823 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3824 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3825 mdev->last_md_mark_dirty.line = line;
3826 mdev->last_md_mark_dirty.func = func;
3827 }
3828}
3829#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003830void drbd_md_mark_dirty(struct drbd_conf *mdev)
3831{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003832 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003833 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003834}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003835#endif
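/*
 * Usage sketch, not part of the original source: callers modify a super
 * block field and mark it dirty; the actual write happens later, e.g. as
 * in drbd_bmio_set_n_write() further down:
 *
 *	drbd_md_set_flag(mdev, MDF_FULL_SYNC);	marks the md dirty internally
 *	drbd_md_sync(mdev);			writes the super block now
 *
 * Callers that cannot sync immediately rely on the timer armed above:
 * after it expires, md_sync_timer_fn() queues w_md_sync(), which warns
 * and writes the super block from the worker.
 */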
Philipp Reisnerb411b362009-09-25 16:07:19 -07003836
3837static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3838{
3839 int i;
3840
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003841 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003842 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003843}
3844
3845void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3846{
3847 if (idx == UI_CURRENT) {
3848 if (mdev->state.role == R_PRIMARY)
3849 val |= 1;
3850 else
3851 val &= ~((u64)1);
3852
3853 drbd_set_ed_uuid(mdev, val);
3854 }
3855
3856 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003857 drbd_md_mark_dirty(mdev);
3858}
3859
3860
3861void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3862{
3863 if (mdev->ldev->md.uuid[idx]) {
3864 drbd_uuid_move_history(mdev);
3865 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003866 }
3867 _drbd_uuid_set(mdev, idx, val);
3868}
3869
3870/**
3871 * drbd_uuid_new_current() - Creates a new current UUID
3872 * @mdev: DRBD device.
3873 *
3874 * Creates a new current UUID, and rotates the old current UUID into
3875 * the bitmap slot. Causes an incremental resync upon next connect.
3876 */
3877void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3878{
3879 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003880 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003881
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003882 if (bm_uuid)
3883 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3884
Philipp Reisnerb411b362009-09-25 16:07:19 -07003885 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003886
3887 get_random_bytes(&val, sizeof(u64));
3888 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003889 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003890 /* get it to stable storage _now_ */
3891 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003892}
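/*
 * Effect sketch, not part of the original source: writing the UUID set
 * as (current, bitmap, history[]), a call here transforms
 *
 *	before:  (C,        B, ...)
 *	after:   (new rand, C, ...)
 *
 * and syncs the meta data, so the next handshake notices the divergence
 * and resolves it with a bitmap based (incremental) resync.
 */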
3893
3894void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3895{
3896 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3897 return;
3898
3899 if (val == 0) {
3900 drbd_uuid_move_history(mdev);
3901 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3902 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003903 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003904 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3905 if (bm_uuid)
3906 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003907
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003908 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003909 }
3910 drbd_md_mark_dirty(mdev);
3911}
3912
3913/**
3914 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3915 * @mdev: DRBD device.
3916 *
3917 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3918 */
3919int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3920{
3921 int rv = -EIO;
3922
3923 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3924 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3925 drbd_md_sync(mdev);
3926 drbd_bm_set_all(mdev);
3927
3928 rv = drbd_bm_write(mdev);
3929
3930 if (!rv) {
3931 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3932 drbd_md_sync(mdev);
3933 }
3934
3935 put_ldev(mdev);
3936 }
3937
3938 return rv;
3939}
3940
3941/**
3942 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3943 * @mdev: DRBD device.
3944 *
3945 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3946 */
3947int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3948{
3949 int rv = -EIO;
3950
Philipp Reisner07782862010-08-31 12:00:50 +02003951 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003952 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3953 drbd_bm_clear_all(mdev);
3954 rv = drbd_bm_write(mdev);
3955 put_ldev(mdev);
3956 }
3957
3958 return rv;
3959}
3960
3961static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3962{
3963 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003964 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003965
3966 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3967
Lars Ellenberg02851e92010-12-16 14:47:39 +01003968 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003969 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003970 rv = work->io_fn(mdev);
3971 drbd_bm_unlock(mdev);
3972 put_ldev(mdev);
3973 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003974
3975 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003976 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07003977 wake_up(&mdev->misc_wait);
3978
3979 if (work->done)
3980 work->done(mdev, rv);
3981
3982 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3983 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003984 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003985
3986 return 1;
3987}
3988
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003989void drbd_ldev_destroy(struct drbd_conf *mdev)
3990{
3991 lc_destroy(mdev->resync);
3992 mdev->resync = NULL;
3993 lc_destroy(mdev->act_log);
3994 mdev->act_log = NULL;
3995 __no_warn(local,
3996 drbd_free_bc(mdev->ldev);
3997 mdev->ldev = NULL;);
3998
3999 if (mdev->md_io_tmpp) {
4000 __free_page(mdev->md_io_tmpp);
4001 mdev->md_io_tmpp = NULL;
4002 }
4003 clear_bit(GO_DISKLESS, &mdev->flags);
4004}
4005
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004006static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4007{
4008 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02004009 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4010 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004011 * the protected members anymore, though, so once put_ldev reaches zero
4012 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004013 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004014 return 1;
4015}
4016
4017void drbd_go_diskless(struct drbd_conf *mdev)
4018{
4019 D_ASSERT(mdev->state.disk == D_FAILED);
4020 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02004021 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004022}
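/*
 * Flow sketch, not part of the original source: detaching after a local
 * IO error is a two-step, worker-driven sequence:
 *
 *	drbd_go_diskless(mdev)	sets GO_DISKLESS, queues the go_diskless work
 *	w_go_diskless()		worker forces the disk state to D_DISKLESS
 *
 * Once the state change is through and the last local reference is
 * dropped (see the put_ldev comment above), drbd_ldev_destroy() frees
 * the activity log, the resync lru and the backing device.
 */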
4023
Philipp Reisnerb411b362009-09-25 16:07:19 -07004024/**
4025 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4026 * @mdev: DRBD device.
4027 * @io_fn: IO callback to be called when bitmap IO is possible
4028 * @done: callback to be called after the bitmap IO was performed
4029 * @why: Descriptive text of the reason for doing the IO
4030 *
 4031 * While IO on the bitmap happens we freeze application IO, thus ensuring
4032 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
4033 * called from worker context. It MUST NOT be used while a previous such
4034 * work is still pending!
4035 */
4036void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4037 int (*io_fn)(struct drbd_conf *),
4038 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004039 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004040{
4041 D_ASSERT(current == mdev->worker.task);
4042
4043 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4044 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4045 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4046 if (mdev->bm_io_work.why)
4047 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4048 why, mdev->bm_io_work.why);
4049
4050 mdev->bm_io_work.io_fn = io_fn;
4051 mdev->bm_io_work.done = done;
4052 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004053 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004054
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004055 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004056 set_bit(BITMAP_IO, &mdev->flags);
4057 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004058 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004059 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004060 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004061 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004062}
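/*
 * Caller sketch, not part of the original source: a typical invocation
 * pairs one of the io_fn helpers above with a completion callback; the
 * done callback name, reason string and flag choice below are
 * placeholders:
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *			     &my_done_fn, "set_n_write from attaching",
 *			     BM_LOCKED_SET_ALLOWED);
 *
 * The worker later runs w_bitmap_io(), which takes the bitmap lock with
 * the given flags, runs io_fn, and finally calls done(mdev, rv).
 */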
4063
4064/**
4065 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4066 * @mdev: DRBD device.
4067 * @io_fn: IO callback to be called when bitmap IO is possible
4068 * @why: Descriptive text of the reason for doing the IO
4069 *
 4070 * Freezes application IO while the actual bitmap IO operation runs. This
 4071 * function MAY NOT be called from worker context.
4072 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004073int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4074 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004075{
4076 int rv;
4077
4078 D_ASSERT(current != mdev->worker.task);
4079
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004080 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4081 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004082
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004083 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004084 rv = io_fn(mdev);
4085 drbd_bm_unlock(mdev);
4086
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004087 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4088 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004089
4090 return rv;
4091}
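/*
 * Caller sketch, not part of the original source: the synchronous
 * variant is for contexts other than the worker; the reason string and
 * flag choice below are placeholders:
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *			    "clear_n_write from resync finished",
 *			    BM_LOCKED_SET_ALLOWED);
 *
 * Unless BM_LOCKED_SET_ALLOWED is passed, application IO is suspended
 * around the bitmap IO via drbd_suspend_io()/drbd_resume_io().
 */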
4092
4093void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4094{
4095 if ((mdev->ldev->md.flags & flag) != flag) {
4096 drbd_md_mark_dirty(mdev);
4097 mdev->ldev->md.flags |= flag;
4098 }
4099}
4100
4101void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4102{
4103 if ((mdev->ldev->md.flags & flag) != 0) {
4104 drbd_md_mark_dirty(mdev);
4105 mdev->ldev->md.flags &= ~flag;
4106 }
4107}
4108int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4109{
4110 return (bdev->md.flags & flag) != 0;
4111}
4112
4113static void md_sync_timer_fn(unsigned long data)
4114{
4115 struct drbd_conf *mdev = (struct drbd_conf *) data;
4116
4117 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4118}
4119
4120static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4121{
4122 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004123#ifdef DEBUG
4124 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4125 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4126#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004127 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004128 return 1;
4129}
4130
4131#ifdef CONFIG_DRBD_FAULT_INJECTION
4132/* Fault insertion support including random number generator shamelessly
4133 * stolen from kernel/rcutorture.c */
4134struct fault_random_state {
4135 unsigned long state;
4136 unsigned long count;
4137};
4138
4139#define FAULT_RANDOM_MULT 39916801 /* prime */
4140#define FAULT_RANDOM_ADD 479001701 /* prime */
4141#define FAULT_RANDOM_REFRESH 10000
4142
4143/*
4144 * Crude but fast random-number generator. Uses a linear congruential
4145 * generator, with occasional help from get_random_bytes().
4146 */
4147static unsigned long
4148_drbd_fault_random(struct fault_random_state *rsp)
4149{
4150 long refresh;
4151
Roel Kluin49829ea2009-12-15 22:55:44 +01004152 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004153 get_random_bytes(&refresh, sizeof(refresh));
4154 rsp->state += refresh;
4155 rsp->count = FAULT_RANDOM_REFRESH;
4156 }
4157 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4158 return swahw32(rsp->state);
4159}
4160
4161static char *
4162_drbd_fault_str(unsigned int type) {
4163 static char *_faults[] = {
4164 [DRBD_FAULT_MD_WR] = "Meta-data write",
4165 [DRBD_FAULT_MD_RD] = "Meta-data read",
4166 [DRBD_FAULT_RS_WR] = "Resync write",
4167 [DRBD_FAULT_RS_RD] = "Resync read",
4168 [DRBD_FAULT_DT_WR] = "Data write",
4169 [DRBD_FAULT_DT_RD] = "Data read",
4170 [DRBD_FAULT_DT_RA] = "Data read ahead",
4171 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004172 [DRBD_FAULT_AL_EE] = "EE allocation",
4173 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004174 };
4175
4176 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4177}
4178
4179unsigned int
4180_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4181{
4182 static struct fault_random_state rrs = {0, 0};
4183
4184 unsigned int ret = (
4185 (fault_devs == 0 ||
4186 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4187 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4188
4189 if (ret) {
4190 fault_count++;
4191
Lars Ellenberg73835062010-05-27 11:51:56 +02004192 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004193 dev_warn(DEV, "***Simulating %s failure\n",
4194 _drbd_fault_str(type));
4195 }
4196
4197 return ret;
4198}
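/*
 * Probability sketch, not part of the original source: with the module
 * parameters fault_rate (effectively a percentage) and fault_devs
 * (bitmask of minors, 0 meaning all), the check above fires on roughly
 * fault_rate percent of the faultable IOs, because ((random % 100) + 1)
 * takes the values 1..100 with roughly equal probability, e.g.
 *
 *	fault_rate=10 fault_devs=1   ->  ~10% of faultable IOs on minor 0
 */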
4199#endif
4200
4201const char *drbd_buildtag(void)
4202{
 4203 /* DRBD built from external sources carries a reference to the
 4204 git hash of the source code here. */
4205
4206 static char buildtag[38] = "\0uilt-in";
4207
4208 if (buildtag[0] == 0) {
4209#ifdef CONFIG_MODULES
4210 if (THIS_MODULE != NULL)
4211 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4212 else
4213#endif
4214 buildtag[0] = 'b';
4215 }
4216
4217 return buildtag;
4218}
4219
4220module_init(drbd_init)
4221module_exit(drbd_cleanup)
4222
Philipp Reisnerb411b362009-09-25 16:07:19 -07004223EXPORT_SYMBOL(drbd_conn_str);
4224EXPORT_SYMBOL(drbd_role_str);
4225EXPORT_SYMBOL(drbd_disk_str);
4226EXPORT_SYMBOL(drbd_set_st_err_str);