/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING. If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
 * attached.
 */
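/*
 * Illustrative sketch only, not used by any code path: walking the
 * transfer log as described above, oldest epoch first, following the
 * ->next chain maintained by tl_init() and _tl_add_barrier().  The
 * hypothetical loop below merely shows how epochs and their attached
 * requests relate:
 *
 *	struct drbd_tl_epoch *b;
 *	struct drbd_request *req;
 *
 *	for (b = mdev->oldest_tle; b != NULL; b = b->next)
 *		list_for_each_entry(req, &b->requests, tl_requests)
 *			;	// one request belonging to this epoch
 */
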
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}
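
/*
 * Hedged usage sketch (illustration only, nothing below is called from
 * here): per the kernel-doc above, a caller links a freshly allocated
 * epoch object while holding the request lock, roughly like this
 * hypothetical snippet:
 *
 *	spin_lock_irq(&mdev->req_lock);
 *	_tl_add_barrier(mdev, new_epoch);
 *	spin_unlock_irq(&mdev->req_lock);
 */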

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch object, this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		       unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruption of the
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle already points to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}


/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}
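
/*
 * Reading the predicate above: promoting to R_PRIMARY, entering one of the
 * StartingSync states, detaching to D_DISKLESS, disconnecting from a
 * connected peer, and starting an online verify all count as cluster wide
 * changes; drbd_req_state() below uses this to decide whether the change
 * must first be negotiated with the peer via drbd_send_state_req().
 */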

enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
enum sanitize_state_warnings {
	NO_WARNING,
	ABORTED_ONLINE_VERIFY,
	ABORTED_RESYNC,
	CONNECTION_LOST_NEGOTIATING,
	IMPLICITLY_UPGRADED_DISK,
	IMPLICITLY_UPGRADED_PDSK,
};
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform a state change, possibly cluster wide
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}
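
/*
 * Hedged usage sketch (illustration only): state changes elsewhere in the
 * driver are requested with mask/val pairs built by the NS() helper from
 * drbd_int.h, along the lines of this hypothetical snippet:
 *
 *	enum drbd_state_rv rv;
 *
 *	rv = drbd_request_state(mdev, NS(disk, D_FAILED));
 *	if (rv < SS_SUCCESS)
 *		;	// refused; see print_st_err() for the reason
 */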

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
	          union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
		rv = SS_CONNECTED_OUTDATES;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
{
	static const char *msg_table[] = {
		[NO_WARNING] = "",
		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
		[ABORTED_RESYNC] = "Resync aborted.",
		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
	};

	if (warn != NO_WARNING)
		dev_warn(DEV, "%s\n", msg_table[warn]);
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn:	where to store an optional warning code (may be NULL).
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, enum sanitize_state_warnings *warn)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	if (warn)
		*warn = NO_WARNING;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn)
			*warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			if (warn)
				*warn = CONNECTION_LOST_NEGOTIATING;
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection state on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary */
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_DISK;
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		if (warn)
			*warn = IMPLICITLY_UPGRADED_PDSK;
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after after_state_ch() has finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	enum sanitize_state_warnings ssw;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &ssw);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/* pre-state-change checks ; only look at ns */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	print_sanitize_warnings(mdev, ssw);

	{
	char *pbp, pb[300];
	pbp = pb;
	*pbp = 0;
	if (ns.role != os.role)
		pbp += sprintf(pbp, "role( %s -> %s ) ",
			       drbd_role_str(os.role),
			       drbd_role_str(ns.role));
	if (ns.peer != os.peer)
		pbp += sprintf(pbp, "peer( %s -> %s ) ",
			       drbd_role_str(os.peer),
			       drbd_role_str(ns.peer));
	if (ns.conn != os.conn)
		pbp += sprintf(pbp, "conn( %s -> %s ) ",
			       drbd_conn_str(os.conn),
			       drbd_conn_str(ns.conn));
	if (ns.disk != os.disk)
		pbp += sprintf(pbp, "disk( %s -> %s ) ",
			       drbd_disk_str(os.disk),
			       drbd_disk_str(ns.disk));
	if (ns.pdsk != os.pdsk)
		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
			       drbd_disk_str(os.pdsk),
			       drbd_disk_str(ns.pdsk));
	if (is_susp(ns) != is_susp(os))
		pbp += sprintf(pbp, "susp( %d -> %d ) ",
			       is_susp(os),
			       is_susp(ns));
	if (ns.aftr_isp != os.aftr_isp)
		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
			       os.aftr_isp,
			       ns.aftr_isp);
	if (ns.peer_isp != os.peer_isp)
		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
			       os.peer_isp,
			       ns.peer_isp);
	if (ns.user_isp != os.user_isp)
		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
			       os.user_isp,
			       ns.user_isp);
	dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
			       int (*io_fn)(struct drbd_conf *),
			       char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current == mdev->worker.task);

	/* open coded non-blocking drbd_suspend_io(mdev); */
	set_bit(SUSPEND_IO, &mdev->flags);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
1353static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1354 union drbd_state ns, enum chg_state_flags flags)
1355{
1356 enum drbd_fencing_p fp;
Philipp Reisner67098932010-06-24 16:24:25 +02001357 enum drbd_req_event what = nothing;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001358 union drbd_state nsm = (union drbd_state){ .i = -1 };
Philipp Reisnerb411b362009-09-25 16:07:19 -07001359
1360 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1361 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1362 if (mdev->p_uuid)
1363 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1364 }
1365
1366 fp = FP_DONT_CARE;
1367 if (get_ldev(mdev)) {
1368 fp = mdev->ldev->dc.fencing;
1369 put_ldev(mdev);
1370 }
1371
1372 /* Inform userspace about the change... */
1373 drbd_bcast_state(mdev, ns);
1374
1375 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1376 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1377 drbd_khelper(mdev, "pri-on-incon-degr");
1378
1379 /* Here we have the actions that are performed after a
1380 state change. This function might sleep */
1381
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001382 nsm.i = -1;
1383 if (ns.susp_nod) {
Philipp Reisner3f986882010-12-20 14:48:20 +01001384 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1385 what = resend;
Philipp Reisner265be2d2010-05-31 10:14:17 +02001386
Philipp Reisner67098932010-06-24 16:24:25 +02001387 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
Philipp Reisner3f986882010-12-20 14:48:20 +01001388 what = restart_frozen_disk_io;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001389
Philipp Reisner3f986882010-12-20 14:48:20 +01001390 if (what != nothing)
1391 nsm.susp_nod = 0;
Philipp Reisner265be2d2010-05-31 10:14:17 +02001392 }
1393
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001394 if (ns.susp_fen) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001395 /* case1: The outdate peer handler is successful: */
1396 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001397 tl_clear(mdev);
Philipp Reisner43a51822010-06-11 11:26:34 +02001398 if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1399 drbd_uuid_new_current(mdev);
1400 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner43a51822010-06-11 11:26:34 +02001401 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001402 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001403 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001404 spin_unlock_irq(&mdev->req_lock);
1405 }
Philipp Reisner43a51822010-06-11 11:26:34 +02001406 /* case2: The connection was established again: */
1407 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1408 clear_bit(NEW_CUR_UUID, &mdev->flags);
Philipp Reisner67098932010-06-24 16:24:25 +02001409 what = resend;
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001410 nsm.susp_fen = 0;
Philipp Reisner43a51822010-06-11 11:26:34 +02001411 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001412 }
Philipp Reisner67098932010-06-24 16:24:25 +02001413
1414 if (what != nothing) {
1415 spin_lock_irq(&mdev->req_lock);
1416 _tl_restart(mdev, what);
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001417 nsm.i &= mdev->state.i;
1418 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
Philipp Reisner67098932010-06-24 16:24:25 +02001419 spin_unlock_irq(&mdev->req_lock);
1420 }
1421
Lars Ellenberg5a22db82010-12-17 21:14:23 +01001422 /* Became sync source. With protocol >= 96, we still need to send out
1423 * the sync uuid now. Need to do that before any drbd_send_state, or
1424 * the other side may go "paused sync" before receiving the sync uuids,
1425 * which is unexpected. */
1426 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1427 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1428 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1429 drbd_gen_and_send_sync_uuid(mdev);
1430 put_ldev(mdev);
1431 }
1432
Philipp Reisnerb411b362009-09-25 16:07:19 -07001433 /* Do not change the order of the if above and the two below... */
1434 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1435 drbd_send_uuids(mdev);
1436 drbd_send_state(mdev);
1437 }
Lars Ellenberg54b956a2011-01-20 10:47:53 +01001438 /* No point in queuing send_bitmap if we don't have a connection
1439 * anymore, so check also the _current_ state, not only the new state
1440 * at the time this work was queued. */
1441 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1442 mdev->state.conn == C_WF_BITMAP_S)
1443 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001444 "send_bitmap (WFBitMapS)",
1445 BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001446
1447 /* Lost contact to peer's copy of the data */
1448 if ((os.pdsk >= D_INCONSISTENT &&
1449 os.pdsk != D_UNKNOWN &&
1450 os.pdsk != D_OUTDATED)
1451 && (ns.pdsk < D_INCONSISTENT ||
1452 ns.pdsk == D_UNKNOWN ||
1453 ns.pdsk == D_OUTDATED)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001454 if (get_ldev(mdev)) {
1455 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001456 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001457 if (is_susp(mdev->state)) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001458 set_bit(NEW_CUR_UUID, &mdev->flags);
1459 } else {
1460 drbd_uuid_new_current(mdev);
1461 drbd_send_uuids(mdev);
1462 }
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001463 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001464 put_ldev(mdev);
1465 }
1466 }
1467
1468 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001469 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001470 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001471 drbd_send_uuids(mdev);
1472 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001473
1474 /* D_DISKLESS Peer becomes secondary */
1475 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001476 /* We may still be Primary ourselves.
1477 * No harm done if the bitmap still changes,
1478 * redirtied pages will follow later. */
1479 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1480 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001481 put_ldev(mdev);
1482 }
1483
Lars Ellenberg06d33e92010-12-18 17:00:59 +01001484 /* Write out all changed bits on demote.
1485 * Though, no need to do that just yet
1486 * if there is a resync going on still */
1487 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1488 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001489 /* No changes to the bitmap expected this time, so assert that,
1490 * even though no harm would be done if it did change. */
1491 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1492 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001493 put_ldev(mdev);
1494 }
1495
1496 /* Last part of the attaching process ... */
1497 if (ns.conn >= C_CONNECTED &&
1498 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001499 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001500 drbd_send_uuids(mdev);
1501 drbd_send_state(mdev);
1502 }
1503
1504 /* We want to pause/continue resync, tell peer. */
1505 if (ns.conn >= C_CONNECTED &&
1506 ((os.aftr_isp != ns.aftr_isp) ||
1507 (os.user_isp != ns.user_isp)))
1508 drbd_send_state(mdev);
1509
1510 /* In case one of the isp bits got set, suspend other devices. */
1511 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1512 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1513 suspend_other_sg(mdev);
1514
1515 /* Make sure the peer gets informed about possible state
1516 changes (ISP bits) while we were in WFReportParams. */
1517 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1518 drbd_send_state(mdev);
1519
Philipp Reisner67531712010-10-27 12:21:30 +02001520 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1521 drbd_send_state(mdev);
1522
Philipp Reisnerb411b362009-09-25 16:07:19 -07001523 /* We are in the process of starting a full sync... */
1524 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1525 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001526 /* no other bitmap changes expected during this phase */
1527 drbd_queue_bitmap_io(mdev,
1528 &drbd_bmio_set_n_write, &abw_start_sync,
1529 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001530
1531 /* We are invalidating ourselves... */
1532 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1533 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001534 /* other bitmap operations are expected during this phase */
1535 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1536 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001537
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001538 /* first half of local IO error, failure to attach,
1539 * or administrative detach */
1540 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1541 enum drbd_io_error_p eh;
1542 int was_io_error;
1543 /* corresponding get_ldev was in __drbd_set_state, to serialize
1544 * our cleanup here with the transition to D_DISKLESS,
1545 * so it is safe to dereference ldev here. */
1546 eh = mdev->ldev->dc.on_io_error;
1547 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1548
1549 /* current state still has to be D_FAILED,
1550 * there is only one way out: to D_DISKLESS,
1551 * and that may only happen after our put_ldev below. */
1552 if (mdev->state.disk != D_FAILED)
1553 dev_err(DEV,
1554 "ASSERT FAILED: disk is %s during detach\n",
1555 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001556
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001557 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001558 dev_info(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001559
1560 drbd_rs_cancel_all(mdev);
1561
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001562 /* In case we want to get something to stable storage still,
1563 * this may be the last chance.
1564 * Following put_ldev may transition to D_DISKLESS. */
1565 drbd_md_sync(mdev);
1566 put_ldev(mdev);
1567
1568 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001569 drbd_khelper(mdev, "local-io-error");
1570 }
1571
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001572 /* second half of local IO error, failure to attach,
1573 * or administrative detach,
1574 * after local_cnt references have reached zero again */
1575 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1576 /* We must still be diskless,
1577 * re-attach has to be serialized with this! */
1578 if (mdev->state.disk != D_DISKLESS)
1579 dev_err(DEV,
1580 "ASSERT FAILED: disk is %s while going diskless\n",
1581 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001582
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001583 mdev->rs_total = 0;
1584 mdev->rs_failed = 0;
1585 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001586
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001587 if (drbd_send_state(mdev))
Lars Ellenberg076673472011-06-21 01:13:37 +02001588 dev_info(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001589 /* corresponding get_ldev in __drbd_set_state;
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001590 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001591 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001592 }
1593
Philipp Reisner738a84b2011-03-03 00:21:30 +01001594 /* Notify peer that I had a local IO error, and did not detach. */
1595 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1596 drbd_send_state(mdev);
1597
Philipp Reisnerb411b362009-09-25 16:07:19 -07001598 /* Disks got bigger while they were detached */
1599 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1600 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1601 if (ns.conn == C_CONNECTED)
1602 resync_after_online_grow(mdev);
1603 }
1604
1605 /* A resync finished or aborted, wake paused devices... */
1606 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1607 (os.peer_isp && !ns.peer_isp) ||
1608 (os.user_isp && !ns.user_isp))
1609 resume_next_sg(mdev);
1610
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001611 /* sync target done with resync. Explicitly notify peer, even though
1612 * it should (at least for non-empty resyncs) already know itself. */
1613 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1614 drbd_send_state(mdev);
1615
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001616 /* This triggers bitmap writeout of potentially still unwritten pages
1617 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001618 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001619 * For resync aborted because of local disk failure, we cannot do
1620 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001621 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001622 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001623 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1624 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1625 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001626 put_ldev(mdev);
1627 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001628
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001629 /* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001630 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001631 drbd_free_tl_hash(mdev);
1632
Philipp Reisnerb411b362009-09-25 16:07:19 -07001633 /* Upon network connection, we need to start the receiver */
1634 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1635 drbd_thread_start(&mdev->receiver);
1636
1637 /* Terminate worker thread if we are unconfigured - it will be
1638 restarted as needed... */
1639 if (ns.disk == D_DISKLESS &&
1640 ns.conn == C_STANDALONE &&
1641 ns.role == R_SECONDARY) {
1642 if (os.aftr_isp != ns.aftr_isp)
1643 resume_next_sg(mdev);
1644 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1645 if (test_bit(DEVICE_DYING, &mdev->flags))
1646 drbd_thread_stop_nowait(&mdev->worker);
1647 }
1648
1649 drbd_md_sync(mdev);
1650}
1651
1652
1653static int drbd_thread_setup(void *arg)
1654{
1655 struct drbd_thread *thi = (struct drbd_thread *) arg;
1656 struct drbd_conf *mdev = thi->mdev;
1657 unsigned long flags;
1658 int retval;
1659
1660restart:
1661 retval = thi->function(thi);
1662
1663 spin_lock_irqsave(&thi->t_lock, flags);
1664
1665 /* if the receiver has been "Exiting", the last thing it did
1666 * was set the conn state to "StandAlone",
1667 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1668 * and receiver thread will be "started".
1669 * drbd_thread_start needs to set "Restarting" in that case.
1670 * t_state check and assignment needs to be within the same spinlock,
1671 * so either thread_start sees Exiting, and can remap to Restarting,
1672 * or thread_start sees None, and can proceed as normal.
1673 */
1674
1675 if (thi->t_state == Restarting) {
1676 dev_info(DEV, "Restarting %s\n", current->comm);
1677 thi->t_state = Running;
1678 spin_unlock_irqrestore(&thi->t_lock, flags);
1679 goto restart;
1680 }
1681
1682 thi->task = NULL;
1683 thi->t_state = None;
1684 smp_mb();
1685 complete(&thi->stop);
1686 spin_unlock_irqrestore(&thi->t_lock, flags);
1687
1688 dev_info(DEV, "Terminating %s\n", current->comm);
1689
1690 /* Release mod reference taken when thread was started */
1691 module_put(THIS_MODULE);
1692 return retval;
1693}
1694
1695static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1696 int (*func) (struct drbd_thread *))
1697{
1698 spin_lock_init(&thi->t_lock);
1699 thi->task = NULL;
1700 thi->t_state = None;
1701 thi->function = func;
1702 thi->mdev = mdev;
1703}
1704
1705int drbd_thread_start(struct drbd_thread *thi)
1706{
1707 struct drbd_conf *mdev = thi->mdev;
1708 struct task_struct *nt;
1709 unsigned long flags;
1710
1711 const char *me =
1712 thi == &mdev->receiver ? "receiver" :
1713 thi == &mdev->asender ? "asender" :
1714 thi == &mdev->worker ? "worker" : "NONSENSE";
1715
1716 /* is used from state engine doing drbd_thread_stop_nowait,
1717 * while holding the req lock irqsave */
1718 spin_lock_irqsave(&thi->t_lock, flags);
1719
1720 switch (thi->t_state) {
1721 case None:
1722 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1723 me, current->comm, current->pid);
1724
1725 /* Get ref on module for thread - this is released when thread exits */
1726 if (!try_module_get(THIS_MODULE)) {
1727 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1728 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001729 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001730 }
1731
1732 init_completion(&thi->stop);
1733 D_ASSERT(thi->task == NULL);
1734 thi->reset_cpu_mask = 1;
1735 thi->t_state = Running;
1736 spin_unlock_irqrestore(&thi->t_lock, flags);
1737 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1738
1739 nt = kthread_create(drbd_thread_setup, (void *) thi,
1740 "drbd%d_%s", mdev_to_minor(mdev), me);
1741
1742 if (IS_ERR(nt)) {
1743 dev_err(DEV, "Couldn't start thread\n");
1744
1745 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001746 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001747 }
1748 spin_lock_irqsave(&thi->t_lock, flags);
1749 thi->task = nt;
1750 thi->t_state = Running;
1751 spin_unlock_irqrestore(&thi->t_lock, flags);
1752 wake_up_process(nt);
1753 break;
1754 case Exiting:
1755 thi->t_state = Restarting;
1756 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1757 me, current->comm, current->pid);
1758 /* fall through */
1759 case Running:
1760 case Restarting:
1761 default:
1762 spin_unlock_irqrestore(&thi->t_lock, flags);
1763 break;
1764 }
1765
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001766 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001767}
1768
1769
1770void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1771{
1772 unsigned long flags;
1773
1774 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1775
1776 /* may be called from state engine, holding the req lock irqsave */
1777 spin_lock_irqsave(&thi->t_lock, flags);
1778
1779 if (thi->t_state == None) {
1780 spin_unlock_irqrestore(&thi->t_lock, flags);
1781 if (restart)
1782 drbd_thread_start(thi);
1783 return;
1784 }
1785
1786 if (thi->t_state != ns) {
1787 if (thi->task == NULL) {
1788 spin_unlock_irqrestore(&thi->t_lock, flags);
1789 return;
1790 }
1791
1792 thi->t_state = ns;
1793 smp_mb();
1794 init_completion(&thi->stop);
1795 if (thi->task != current)
1796 force_sig(DRBD_SIGKILL, thi->task);
1797
1798 }
1799
1800 spin_unlock_irqrestore(&thi->t_lock, flags);
1801
1802 if (wait)
1803 wait_for_completion(&thi->stop);
1804}
1805
1806#ifdef CONFIG_SMP
1807/**
1808 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1809 * @mdev: DRBD device.
1810 *
1811 * Forces all threads of a device onto the same CPU. This is beneficial for
1812 * DRBD's performance. May be overridden by the user's configuration.
1813 */
1814void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1815{
1816 int ord, cpu;
1817
1818 /* user override. */
1819 if (cpumask_weight(mdev->cpu_mask))
1820 return;
1821
1822 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1823 for_each_online_cpu(cpu) {
1824 if (ord-- == 0) {
1825 cpumask_set_cpu(cpu, mdev->cpu_mask);
1826 return;
1827 }
1828 }
1829 /* should not be reached */
1830 cpumask_setall(mdev->cpu_mask);
1831}
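/*
 * Illustrative note, not part of the original source: the ordinal mapping
 * above effectively computes (assuming the online CPUs are numbered
 * contiguously from 0)
 *
 *	cpu = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
 *
 * so with four online CPUs, the threads of minors 0..5 would be pinned to
 * CPUs 0, 1, 2, 3, 0, 1 respectively.
 */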
1832
1833/**
1834 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1835 * @mdev: DRBD device.
1836 *
1837 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1838 * prematurely.
1839 */
1840void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1841{
1842 struct task_struct *p = current;
1843 struct drbd_thread *thi =
1844 p == mdev->asender.task ? &mdev->asender :
1845 p == mdev->receiver.task ? &mdev->receiver :
1846 p == mdev->worker.task ? &mdev->worker :
1847 NULL;
1848 ERR_IF(thi == NULL)
1849 return;
1850 if (!thi->reset_cpu_mask)
1851 return;
1852 thi->reset_cpu_mask = 0;
1853 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1854}
1855#endif
1856
1857/* the appropriate socket mutex must be held already */
1858int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001859 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001860 size_t size, unsigned msg_flags)
1861{
1862 int sent, ok;
1863
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001864 ERR_IF(!h) return false;
1865 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001866
1867 h->magic = BE_DRBD_MAGIC;
1868 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001869 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001870
Philipp Reisnerb411b362009-09-25 16:07:19 -07001871 sent = drbd_send(mdev, sock, h, size, msg_flags);
1872
1873 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001874 if (!ok && !signal_pending(current))
1875 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001876 cmdname(cmd), (int)size, sent);
1877 return ok;
1878}
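/*
 * Illustrative note, not part of the original source: h->length carries
 * only the payload size, so for a packet of 'size' bytes in total the
 * peer expects to read
 *
 *	size - sizeof(struct p_header80)
 *
 * additional bytes after the header it has already received.
 */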
1879
1880/* don't pass the socket. we may only look at it
1881 * when we hold the appropriate socket mutex.
1882 */
1883int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001884 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001885{
1886 int ok = 0;
1887 struct socket *sock;
1888
1889 if (use_data_socket) {
1890 mutex_lock(&mdev->data.mutex);
1891 sock = mdev->data.socket;
1892 } else {
1893 mutex_lock(&mdev->meta.mutex);
1894 sock = mdev->meta.socket;
1895 }
1896
1897 /* drbd_disconnect() could have called drbd_free_sock()
1898 * while we were waiting in down()... */
1899 if (likely(sock != NULL))
1900 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1901
1902 if (use_data_socket)
1903 mutex_unlock(&mdev->data.mutex);
1904 else
1905 mutex_unlock(&mdev->meta.mutex);
1906 return ok;
1907}
1908
1909int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1910 size_t size)
1911{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001912 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001913 int ok;
1914
1915 h.magic = BE_DRBD_MAGIC;
1916 h.command = cpu_to_be16(cmd);
1917 h.length = cpu_to_be16(size);
1918
1919 if (!drbd_get_data_sock(mdev))
1920 return 0;
1921
Philipp Reisnerb411b362009-09-25 16:07:19 -07001922 ok = (sizeof(h) ==
1923 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1924 ok = ok && (size ==
1925 drbd_send(mdev, mdev->data.socket, data, size, 0));
1926
1927 drbd_put_data_sock(mdev);
1928
1929 return ok;
1930}
1931
1932int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1933{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001934 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001935 struct socket *sock;
1936 int size, rv;
1937 const int apv = mdev->agreed_pro_version;
1938
1939 size = apv <= 87 ? sizeof(struct p_rs_param)
1940 : apv == 88 ? sizeof(struct p_rs_param)
1941 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001942 : apv <= 94 ? sizeof(struct p_rs_param_89)
1943 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001944
1945 /* used from admin command context and receiver/worker context.
1946 * to avoid kmalloc, grab the socket right here,
1947 * then use the pre-allocated sbuf there */
1948 mutex_lock(&mdev->data.mutex);
1949 sock = mdev->data.socket;
1950
1951 if (likely(sock != NULL)) {
1952 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1953
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001954 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001955
1956 /* initialize verify_alg and csums_alg */
1957 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1958
1959 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001960 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1961 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1962 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1963 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001964
1965 if (apv >= 88)
1966 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1967 if (apv >= 89)
1968 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1969
1970 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1971 } else
1972 rv = 0; /* not ok */
1973
1974 mutex_unlock(&mdev->data.mutex);
1975
1976 return rv;
1977}
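/*
 * Illustrative note, not part of the original source: the send buffer is
 * laid out as the largest variant, struct p_rs_param_95, and the c_*
 * fields are filled in unconditionally, but only 'size' bytes of it go on
 * the wire, so a peer with agreed_pro_version <= 94 never sees the newer
 * fields.
 */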
1978
1979int drbd_send_protocol(struct drbd_conf *mdev)
1980{
1981 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001982 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001983
1984 size = sizeof(struct p_protocol);
1985
1986 if (mdev->agreed_pro_version >= 87)
1987 size += strlen(mdev->net_conf->integrity_alg) + 1;
1988
1989 /* we must not recurse into our own queue,
1990 * as that is blocked during handshake */
1991 p = kmalloc(size, GFP_NOIO);
1992 if (p == NULL)
1993 return 0;
1994
1995 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1996 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1997 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1998 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001999 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2000
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002001 cf = 0;
2002 if (mdev->net_conf->want_lose)
2003 cf |= CF_WANT_LOSE;
2004 if (mdev->net_conf->dry_run) {
2005 if (mdev->agreed_pro_version >= 92)
2006 cf |= CF_DRY_RUN;
2007 else {
2008 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02002009 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002010 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002011 }
2012 }
2013 p->conn_flags = cpu_to_be32(cf);
2014
Philipp Reisnerb411b362009-09-25 16:07:19 -07002015 if (mdev->agreed_pro_version >= 87)
2016 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2017
2018 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002019 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002020 kfree(p);
2021 return rv;
2022}
2023
2024int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2025{
2026 struct p_uuids p;
2027 int i;
2028
2029 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2030 return 1;
2031
2032 for (i = UI_CURRENT; i < UI_SIZE; i++)
2033 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2034
2035 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2036 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2037 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2038 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2039 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2040 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2041
2042 put_ldev(mdev);
2043
2044 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002045 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002046}
2047
2048int drbd_send_uuids(struct drbd_conf *mdev)
2049{
2050 return _drbd_send_uuids(mdev, 0);
2051}
2052
2053int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2054{
2055 return _drbd_send_uuids(mdev, 8);
2056}
2057
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002058void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2059{
2060 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2061 u64 *uuid = mdev->ldev->md.uuid;
2062 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2063 text,
2064 (unsigned long long)uuid[UI_CURRENT],
2065 (unsigned long long)uuid[UI_BITMAP],
2066 (unsigned long long)uuid[UI_HISTORY_START],
2067 (unsigned long long)uuid[UI_HISTORY_END]);
2068 put_ldev(mdev);
2069 } else {
2070 dev_info(DEV, "%s effective data uuid: %016llX\n",
2071 text,
2072 (unsigned long long)mdev->ed_uuid);
2073 }
2074}
2075
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002076int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002077{
2078 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002079 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002080
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002081 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2082
Philipp Reisner4a23f262011-01-11 17:42:17 +01002083 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002084 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002085 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002086 drbd_md_sync(mdev);
2087 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002088
2089 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002090 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002091}
2092
Philipp Reisnere89b5912010-03-24 17:11:33 +01002093int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002094{
2095 struct p_sizes p;
2096 sector_t d_size, u_size;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002097 int q_order_type, max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002098 int ok;
2099
2100 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2101 D_ASSERT(mdev->ldev->backing_bdev);
2102 d_size = drbd_get_max_capacity(mdev->ldev);
2103 u_size = mdev->ldev->dc.disk_size;
2104 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002105 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2106 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002107 put_ldev(mdev);
2108 } else {
2109 d_size = 0;
2110 u_size = 0;
2111 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002112 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002113 }
2114
Philipp Reisner68093842011-06-30 15:43:06 +02002115 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2116 if (mdev->agreed_pro_version <= 94)
2117 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
2118
Philipp Reisnerb411b362009-09-25 16:07:19 -07002119 p.d_size = cpu_to_be64(d_size);
2120 p.u_size = cpu_to_be64(u_size);
2121 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002122 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002123 p.queue_order_type = cpu_to_be16(q_order_type);
2124 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002125
2126 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002127 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002128 return ok;
2129}
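/*
 * Illustrative example, not part of the original source: if the backing
 * device advertises, say, 128 KiB max_hw_sectors but the peer still runs
 * agreed_pro_version <= 94, max_bio_size is clamped to
 * DRBD_MAX_SIZE_H80_PACKET (32 KiB, per the comment above) before it is
 * announced in p.max_bio_size.
 */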
2130
2131/**
2132 * drbd_send_state() - Sends the drbd state to the peer
2133 * @mdev: DRBD device.
2134 */
2135int drbd_send_state(struct drbd_conf *mdev)
2136{
2137 struct socket *sock;
2138 struct p_state p;
2139 int ok = 0;
2140
2141 /* Grab state lock so we won't send state if we're in the middle
2142 * of a cluster wide state change on another thread */
2143 drbd_state_lock(mdev);
2144
2145 mutex_lock(&mdev->data.mutex);
2146
2147 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2148 sock = mdev->data.socket;
2149
2150 if (likely(sock != NULL)) {
2151 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002152 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002153 }
2154
2155 mutex_unlock(&mdev->data.mutex);
2156
2157 drbd_state_unlock(mdev);
2158 return ok;
2159}
2160
2161int drbd_send_state_req(struct drbd_conf *mdev,
2162 union drbd_state mask, union drbd_state val)
2163{
2164 struct p_req_state p;
2165
2166 p.mask = cpu_to_be32(mask.i);
2167 p.val = cpu_to_be32(val.i);
2168
2169 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002170 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002171}
2172
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002173int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002174{
2175 struct p_req_state_reply p;
2176
2177 p.retcode = cpu_to_be32(retcode);
2178
2179 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002180 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002181}
2182
2183int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2184 struct p_compressed_bm *p,
2185 struct bm_xfer_ctx *c)
2186{
2187 struct bitstream bs;
2188 unsigned long plain_bits;
2189 unsigned long tmp;
2190 unsigned long rl;
2191 unsigned len;
2192 unsigned toggle;
2193 int bits;
2194
2195 /* may we use this feature? */
2196 if ((mdev->sync_conf.use_rle == 0) ||
2197 (mdev->agreed_pro_version < 90))
2198 return 0;
2199
2200 if (c->bit_offset >= c->bm_bits)
2201 return 0; /* nothing to do. */
2202
2203 /* use at most thus many bytes */
2204 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2205 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2206 /* plain bits covered in this code string */
2207 plain_bits = 0;
2208
2209 /* p->encoding & 0x80 stores whether the first run length is set.
2210 * bit offset is implicit.
2211 * start with toggle == 2 to be able to tell the first iteration */
2212 toggle = 2;
2213
2214 /* see how many plain bits we can stuff into one packet
2215 * using RLE and VLI. */
2216 do {
2217 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2218 : _drbd_bm_find_next(mdev, c->bit_offset);
2219 if (tmp == -1UL)
2220 tmp = c->bm_bits;
2221 rl = tmp - c->bit_offset;
2222
2223 if (toggle == 2) { /* first iteration */
2224 if (rl == 0) {
2225 /* the first checked bit was set,
2226 * store start value, */
2227 DCBP_set_start(p, 1);
2228 /* but skip encoding of zero run length */
2229 toggle = !toggle;
2230 continue;
2231 }
2232 DCBP_set_start(p, 0);
2233 }
2234
2235 /* paranoia: catch zero runlength.
2236 * can only happen if bitmap is modified while we scan it. */
2237 if (rl == 0) {
2238 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2239 "t:%u bo:%lu\n", toggle, c->bit_offset);
2240 return -1;
2241 }
2242
2243 bits = vli_encode_bits(&bs, rl);
2244 if (bits == -ENOBUFS) /* buffer full */
2245 break;
2246 if (bits <= 0) {
2247 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2248 return 0;
2249 }
2250
2251 toggle = !toggle;
2252 plain_bits += rl;
2253 c->bit_offset = tmp;
2254 } while (c->bit_offset < c->bm_bits);
2255
2256 len = bs.cur.b - p->code + !!bs.cur.bit;
2257
2258 if (plain_bits < (len << 3)) {
2259 /* incompressible with this method.
2260 * we need to rewind both word and bit position. */
2261 c->bit_offset -= plain_bits;
2262 bm_xfer_ctx_bit_to_word_offset(c);
2263 c->bit_offset = c->word_offset * BITS_PER_LONG;
2264 return 0;
2265 }
2266
2267 /* RLE + VLI was able to compress it just fine.
2268 * update c->word_offset. */
2269 bm_xfer_ctx_bit_to_word_offset(c);
2270
2271 /* store pad_bits */
2272 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2273
2274 return len;
2275}
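/*
 * Illustrative example, not part of the original source: for a bitmap
 * beginning with the bits 0000011100001... the encoder above clears the
 * "start" flag (the first examined bit is clear) and then VLI-encodes the
 * alternating run lengths 5, 3, 4, 1, ... until either the packet buffer
 * is full or the end of the bitmap is reached; if the encoded form turns
 * out larger than the plain bits it covers, the caller falls back to
 * sending plain bitmap words.
 */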
2276
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002277/**
2278 * send_bitmap_rle_or_plain
2279 *
2280 * Return 0 when done, 1 when another iteration is needed, and a negative error
2281 * code upon failure.
2282 */
2283static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002284send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002285 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002286{
2287 struct p_compressed_bm *p = (void*)h;
2288 unsigned long num_words;
2289 int len;
2290 int ok;
2291
2292 len = fill_bitmap_rle_bits(mdev, p, c);
2293
2294 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002295 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002296
2297 if (len) {
2298 DCBP_set_code(p, RLE_VLI_Bits);
2299 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2300 sizeof(*p) + len, 0);
2301
2302 c->packets[0]++;
2303 c->bytes[0] += sizeof(*p) + len;
2304
2305 if (c->bit_offset >= c->bm_bits)
2306 len = 0; /* DONE */
2307 } else {
2308 /* was not compressible.
2309 * send a buffer full of plain text bits instead. */
2310 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2311 len = num_words * sizeof(long);
2312 if (len)
2313 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2314 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002315 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002316 c->word_offset += num_words;
2317 c->bit_offset = c->word_offset * BITS_PER_LONG;
2318
2319 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002320 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002321
2322 if (c->bit_offset > c->bm_bits)
2323 c->bit_offset = c->bm_bits;
2324 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002325 if (ok) {
2326 if (len == 0) {
2327 INFO_bm_xfer_stats(mdev, "send", c);
2328 return 0;
2329 } else
2330 return 1;
2331 }
2332 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002333}
2334
2335/* See the comment at receive_bitmap() */
2336int _drbd_send_bitmap(struct drbd_conf *mdev)
2337{
2338 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002339 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002340 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002341
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002342 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002343
2344 /* maybe we should use some per thread scratch page,
2345 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002346 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002347 if (!p) {
2348 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002349 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002350 }
2351
2352 if (get_ldev(mdev)) {
2353 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2354 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2355 drbd_bm_set_all(mdev);
2356 if (drbd_bm_write(mdev)) {
2357 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2358 * but otherwise process as per normal - need to tell other
2359 * side that a full resync is required! */
2360 dev_err(DEV, "Failed to write bitmap to disk!\n");
2361 } else {
2362 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2363 drbd_md_sync(mdev);
2364 }
2365 }
2366 put_ldev(mdev);
2367 }
2368
2369 c = (struct bm_xfer_ctx) {
2370 .bm_bits = drbd_bm_bits(mdev),
2371 .bm_words = drbd_bm_words(mdev),
2372 };
2373
2374 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002375 err = send_bitmap_rle_or_plain(mdev, p, &c);
2376 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002377
2378 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002379 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002380}
2381
2382int drbd_send_bitmap(struct drbd_conf *mdev)
2383{
2384 int err;
2385
2386 if (!drbd_get_data_sock(mdev))
2387 return -1;
2388 err = !_drbd_send_bitmap(mdev);
2389 drbd_put_data_sock(mdev);
2390 return err;
2391}
2392
2393int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2394{
2395 int ok;
2396 struct p_barrier_ack p;
2397
2398 p.barrier = barrier_nr;
2399 p.set_size = cpu_to_be32(set_size);
2400
2401 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002402 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002403 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002404 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002405 return ok;
2406}
2407
2408/**
2409 * _drbd_send_ack() - Sends an ack packet
2410 * @mdev: DRBD device.
2411 * @cmd: Packet command code.
2412 * @sector: sector, needs to be in big endian byte order
2413 * @blksize: size in byte, needs to be in big endian byte order
2414 * @block_id: Id, big endian byte order
2415 */
2416static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2417 u64 sector,
2418 u32 blksize,
2419 u64 block_id)
2420{
2421 int ok;
2422 struct p_block_ack p;
2423
2424 p.sector = sector;
2425 p.block_id = block_id;
2426 p.blksize = blksize;
2427 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2428
2429 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002430 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002431 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002432 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002433 return ok;
2434}
2435
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002436/* dp->sector and dp->block_id already/still in network byte order,
2437 * data_size is payload size according to dp->head,
2438 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002439int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002440 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002441{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002442 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2443 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002444 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2445 dp->block_id);
2446}
2447
2448int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2449 struct p_block_req *rp)
2450{
2451 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2452}
2453
2454/**
2455 * drbd_send_ack() - Sends an ack packet
2456 * @mdev: DRBD device.
2457 * @cmd: Packet command code.
2458 * @e: Epoch entry.
2459 */
2460int drbd_send_ack(struct drbd_conf *mdev,
2461 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2462{
2463 return _drbd_send_ack(mdev, cmd,
2464 cpu_to_be64(e->sector),
2465 cpu_to_be32(e->size),
2466 e->block_id);
2467}
2468
2469/* This function misuses the block_id field to signal if the blocks
2470 * are in sync or not. */
2471int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2472 sector_t sector, int blksize, u64 block_id)
2473{
2474 return _drbd_send_ack(mdev, cmd,
2475 cpu_to_be64(sector),
2476 cpu_to_be32(blksize),
2477 cpu_to_be64(block_id));
2478}
2479
2480int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2481 sector_t sector, int size, u64 block_id)
2482{
2483 int ok;
2484 struct p_block_req p;
2485
2486 p.sector = cpu_to_be64(sector);
2487 p.block_id = block_id;
2488 p.blksize = cpu_to_be32(size);
2489
2490 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002491 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002492 return ok;
2493}
2494
2495int drbd_send_drequest_csum(struct drbd_conf *mdev,
2496 sector_t sector, int size,
2497 void *digest, int digest_size,
2498 enum drbd_packets cmd)
2499{
2500 int ok;
2501 struct p_block_req p;
2502
2503 p.sector = cpu_to_be64(sector);
2504 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2505 p.blksize = cpu_to_be32(size);
2506
2507 p.head.magic = BE_DRBD_MAGIC;
2508 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002509 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002510
2511 mutex_lock(&mdev->data.mutex);
2512
2513 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2514 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2515
2516 mutex_unlock(&mdev->data.mutex);
2517
2518 return ok;
2519}
2520
2521int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2522{
2523 int ok;
2524 struct p_block_req p;
2525
2526 p.sector = cpu_to_be64(sector);
2527 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2528 p.blksize = cpu_to_be32(size);
2529
2530 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002531 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002532 return ok;
2533}
2534
2535/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002536 * returns false if we should retry,
2537 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002538 */
2539static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2540{
2541 int drop_it;
2542 /* long elapsed = (long)(jiffies - mdev->last_received); */
2543
2544 drop_it = mdev->meta.socket == sock
2545 || !mdev->asender.task
2546 || get_t_state(&mdev->asender) != Running
2547 || mdev->state.conn < C_CONNECTED;
2548
2549 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002550 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002551
2552 drop_it = !--mdev->ko_count;
2553 if (!drop_it) {
2554 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2555 current->comm, current->pid, mdev->ko_count);
2556 request_ping(mdev);
2557 }
2558
2559 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2560}
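/*
 * Illustrative note, not part of the original source: mdev->ko_count is
 * (re)armed from net_conf->ko_count at the start of drbd_send() below.
 * Each send timeout on the data socket then decrements it, and only once
 * it reaches zero do we report the connection as dead; until then we just
 * request a ping and let the caller retry.
 */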
2561
2562/* The idea of sendpage seems to be to put some kind of reference
2563 * to the page into the skb, and to hand it over to the NIC. In
2564 * this process get_page() gets called.
2565 *
2566 * As soon as the page was really sent over the network put_page()
2567 * gets called by some part of the network layer. [ NIC driver? ]
2568 *
2569 * [ get_page() / put_page() increment/decrement the count. If count
2570 * reaches 0 the page will be freed. ]
2571 *
2572 * This works nicely with pages from FSs.
2573 * But this means that in protocol A we might signal IO completion too early!
2574 *
2575 * In order not to corrupt data during a resync we must make sure
2576 * that we do not reuse our own buffer pages (EEs) too early, therefore
2577 * we have the net_ee list.
2578 *
2579 * XFS still seems to have problems: it submits pages with page_count == 0!
2580 * As a workaround, we disable sendpage on pages
2581 * with page_count == 0 or PageSlab.
2582 */
2583static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002584 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002585{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002586 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002587 kunmap(page);
2588 if (sent == size)
2589 mdev->send_cnt += size>>9;
2590 return sent == size;
2591}
2592
2593static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002594 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002595{
2596 mm_segment_t oldfs = get_fs();
2597 int sent, ok;
2598 int len = size;
2599
2600 /* e.g. XFS meta- & log-data is in slab pages, which have a
2601 * page_count of 0 and/or have PageSlab() set.
2602 * we cannot use send_page for those, as that does get_page();
2603 * put_page(); and would cause either a VM_BUG directly, or
2604 * __page_cache_release a page that would actually still be referenced
2605 * by someone, leading to some obscure delayed Oops somewhere else. */
2606 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002607 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002608
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002609 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002610 drbd_update_congested(mdev);
2611 set_fs(KERNEL_DS);
2612 do {
2613 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2614 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002615 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002616 if (sent == -EAGAIN) {
2617 if (we_should_drop_the_connection(mdev,
2618 mdev->data.socket))
2619 break;
2620 else
2621 continue;
2622 }
2623 if (sent <= 0) {
2624 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2625 __func__, (int)size, len, sent);
2626 break;
2627 }
2628 len -= sent;
2629 offset += sent;
2630 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2631 set_fs(oldfs);
2632 clear_bit(NET_CONGESTED, &mdev->flags);
2633
2634 ok = (len == 0);
2635 if (likely(ok))
2636 mdev->send_cnt += size>>9;
2637 return ok;
2638}
2639
2640static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2641{
2642 struct bio_vec *bvec;
2643 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002644 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002645 __bio_for_each_segment(bvec, bio, i, 0) {
2646 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002647 bvec->bv_offset, bvec->bv_len,
2648 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002649 return 0;
2650 }
2651 return 1;
2652}
2653
2654static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2655{
2656 struct bio_vec *bvec;
2657 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002658 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002659 __bio_for_each_segment(bvec, bio, i, 0) {
2660 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002661 bvec->bv_offset, bvec->bv_len,
2662 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002663 return 0;
2664 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002665 return 1;
2666}
2667
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002668static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2669{
2670 struct page *page = e->pages;
2671 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002672 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002673 page_chain_for_each(page) {
2674 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002675 if (!_drbd_send_page(mdev, page, 0, l,
2676 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002677 return 0;
2678 len -= l;
2679 }
2680 return 1;
2681}
2682
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002683static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2684{
2685 if (mdev->agreed_pro_version >= 95)
2686 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002687 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2688 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2689 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2690 else
Jens Axboe721a9602011-03-09 11:56:30 +01002691 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002692}
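/*
 * Illustrative example, not part of the original source: with a peer at
 * agreed_pro_version >= 95, a bio submitted with REQ_SYNC | REQ_FUA is
 * announced as DP_RW_SYNC | DP_FUA; an older peer only ever gets told
 * about DP_RW_SYNC.
 */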
2693
Philipp Reisnerb411b362009-09-25 16:07:19 -07002694/* Used to send write requests
2695 * R_PRIMARY -> Peer (P_DATA)
2696 */
2697int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2698{
2699 int ok = 1;
2700 struct p_data p;
2701 unsigned int dp_flags = 0;
2702 void *dgb;
2703 int dgs;
2704
2705 if (!drbd_get_data_sock(mdev))
2706 return 0;
2707
2708 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2709 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2710
Philipp Reisnerd5373382010-08-23 15:18:33 +02002711 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002712 p.head.h80.magic = BE_DRBD_MAGIC;
2713 p.head.h80.command = cpu_to_be16(P_DATA);
2714 p.head.h80.length =
2715 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2716 } else {
2717 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2718 p.head.h95.command = cpu_to_be16(P_DATA);
2719 p.head.h95.length =
2720 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2721 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002722
2723 p.sector = cpu_to_be64(req->sector);
2724 p.block_id = (unsigned long)req;
2725 p.seq_num = cpu_to_be32(req->seq_num =
2726 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002727
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002728 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2729
Philipp Reisnerb411b362009-09-25 16:07:19 -07002730 if (mdev->state.conn >= C_SYNC_SOURCE &&
2731 mdev->state.conn <= C_PAUSED_SYNC_T)
2732 dp_flags |= DP_MAY_SET_IN_SYNC;
2733
2734 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002735 set_bit(UNPLUG_REMOTE, &mdev->flags);
2736 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002737 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002738 if (ok && dgs) {
2739 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002740 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002741 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002742 }
2743 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002744 /* For protocol A, we have to memcpy the payload into
2745 * socket buffers, as we may complete right away
2746 * as soon as we handed it over to tcp, at which point the data
2747 * pages may become invalid.
2748 *
2749 * For data-integrity enabled, we copy it as well, so we can be
2750 * sure that even if the bio pages may still be modified, it
2751 * won't change the data on the wire, thus if the digest checks
2752 * out ok after sending on this side, but does not fit on the
2753 * receiving side, we sure have detected corruption elsewhere.
2754 */
2755 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002756 ok = _drbd_send_bio(mdev, req->master_bio);
2757 else
2758 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002759
2760 /* double check digest, sometimes buffers have been modified in flight. */
2761 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002762 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002763 * currently supported in kernel crypto. */
2764 unsigned char digest[64];
2765 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2766 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2767 dev_warn(DEV,
2768 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2769 (unsigned long long)req->sector, req->size);
2770 }
2771 } /* else if (dgs > 64) {
2772 ... Be noisy about digest too large ...
2773 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002774 }
2775
2776 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002777
Philipp Reisnerb411b362009-09-25 16:07:19 -07002778 return ok;
2779}
2780
2781/* answer packet, used to send data back for read requests:
2782 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2783 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2784 */
2785int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2786 struct drbd_epoch_entry *e)
2787{
2788 int ok;
2789 struct p_data p;
2790 void *dgb;
2791 int dgs;
2792
2793 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2794 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2795
Philipp Reisnerd5373382010-08-23 15:18:33 +02002796 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002797 p.head.h80.magic = BE_DRBD_MAGIC;
2798 p.head.h80.command = cpu_to_be16(cmd);
2799 p.head.h80.length =
2800 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2801 } else {
2802 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2803 p.head.h95.command = cpu_to_be16(cmd);
2804 p.head.h95.length =
2805 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2806 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002807
2808 p.sector = cpu_to_be64(e->sector);
2809 p.block_id = e->block_id;
2810 /* p.seq_num = 0; No sequence numbers here.. */
2811
2812 /* Only called by our kernel thread.
2813 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2814 * in response to admin command or module unload.
2815 */
2816 if (!drbd_get_data_sock(mdev))
2817 return 0;
2818
Philipp Reisner0b70a132010-08-20 13:36:10 +02002819 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002820 if (ok && dgs) {
2821 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002822 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002823 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002824 }
2825 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002826 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002827
2828 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002829
Philipp Reisnerb411b362009-09-25 16:07:19 -07002830 return ok;
2831}
2832
Philipp Reisner73a01a12010-10-27 14:33:00 +02002833int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2834{
2835 struct p_block_desc p;
2836
2837 p.sector = cpu_to_be64(req->sector);
2838 p.blksize = cpu_to_be32(req->size);
2839
2840 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2841}
2842
Philipp Reisnerb411b362009-09-25 16:07:19 -07002843/*
2844 drbd_send distinguishes two cases:
2845
2846 Packets sent via the data socket "sock"
2847 and packets sent via the meta data socket "msock"
2848
2849 sock msock
2850 -----------------+-------------------------+------------------------------
2851 timeout conf.timeout / 2 conf.timeout / 2
2852 timeout action send a ping via msock Abort communication
2853 and close all sockets
2854*/
2855
2856/*
2857 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2858 */
2859int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2860 void *buf, size_t size, unsigned msg_flags)
2861{
2862 struct kvec iov;
2863 struct msghdr msg;
2864 int rv, sent = 0;
2865
2866 if (!sock)
2867 return -1000;
2868
2869 /* THINK if (signal_pending) return ... ? */
2870
2871 iov.iov_base = buf;
2872 iov.iov_len = size;
2873
2874 msg.msg_name = NULL;
2875 msg.msg_namelen = 0;
2876 msg.msg_control = NULL;
2877 msg.msg_controllen = 0;
2878 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2879
2880 if (sock == mdev->data.socket) {
2881 mdev->ko_count = mdev->net_conf->ko_count;
2882 drbd_update_congested(mdev);
2883 }
2884 do {
2885 /* STRANGE
2886 * tcp_sendmsg does _not_ use its size parameter at all ?
2887 *
2888 * -EAGAIN on timeout, -EINTR on signal.
2889 */
2890/* THINK
2891 * do we need to block DRBD_SIG if sock == &meta.socket ??
2892 * otherwise wake_asender() might interrupt some send_*Ack !
2893 */
2894 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2895 if (rv == -EAGAIN) {
2896 if (we_should_drop_the_connection(mdev, sock))
2897 break;
2898 else
2899 continue;
2900 }
2901 D_ASSERT(rv != 0);
2902 if (rv == -EINTR) {
2903 flush_signals(current);
2904 rv = 0;
2905 }
2906 if (rv < 0)
2907 break;
2908 sent += rv;
2909 iov.iov_base += rv;
2910 iov.iov_len -= rv;
2911 } while (sent < size);
2912
2913 if (sock == mdev->data.socket)
2914 clear_bit(NET_CONGESTED, &mdev->flags);
2915
2916 if (rv <= 0) {
2917 if (rv != -EAGAIN) {
2918 dev_err(DEV, "%s_sendmsg returned %d\n",
2919 sock == mdev->meta.socket ? "msock" : "sock",
2920 rv);
2921 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2922 } else
2923 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2924 }
2925
2926 return sent;
2927}
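/*
 * Illustrative note, not part of the original source: kernel_sendmsg()
 * may transmit only part of the buffer. The loop above then advances
 * iov.iov_base and shrinks iov.iov_len by what was already sent, so a
 * short return of, say, 1000 out of 4096 bytes simply results in another
 * sendmsg call for the remaining 3096 bytes.
 */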
2928
2929static int drbd_open(struct block_device *bdev, fmode_t mode)
2930{
2931 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2932 unsigned long flags;
2933 int rv = 0;
2934
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002935 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002936 spin_lock_irqsave(&mdev->req_lock, flags);
2937 /* to have a stable mdev->state.role
2938 * and no race with updating open_cnt */
2939
2940 if (mdev->state.role != R_PRIMARY) {
2941 if (mode & FMODE_WRITE)
2942 rv = -EROFS;
2943 else if (!allow_oos)
2944 rv = -EMEDIUMTYPE;
2945 }
2946
2947 if (!rv)
2948 mdev->open_cnt++;
2949 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002950 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002951
2952 return rv;
2953}
2954
2955static int drbd_release(struct gendisk *gd, fmode_t mode)
2956{
2957 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002958 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002959 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002960 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002961 return 0;
2962}
2963
Philipp Reisnerb411b362009-09-25 16:07:19 -07002964static void drbd_set_defaults(struct drbd_conf *mdev)
2965{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002966 /* This way we get a compile error when sync_conf grows,
 2967	    and we forget to initialize it here */
2968 mdev->sync_conf = (struct syncer_conf) {
2969 /* .rate = */ DRBD_RATE_DEF,
2970 /* .after = */ DRBD_AFTER_DEF,
2971 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002972 /* .verify_alg = */ {}, 0,
2973 /* .cpu_mask = */ {}, 0,
2974 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002975 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002976 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2977 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2978 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2979 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002980 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2981 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002982 };
2983
 2984	/* Have to do it this way, because the layout differs between
2985 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002986 mdev->state = (union drbd_state) {
2987 { .role = R_SECONDARY,
2988 .peer = R_UNKNOWN,
2989 .conn = C_STANDALONE,
2990 .disk = D_DISKLESS,
2991 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002992 .susp = 0,
2993 .susp_nod = 0,
2994 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002995 } };
2996}
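/*
 * Illustrative sketch (not part of DRBD): why drbd_set_defaults() assigns the
 * state through named bitfield members.  C leaves bitfield placement inside
 * the containing word to the ABI, so composing the state word by hand with
 * shifts would put role/conn/disk in different bits on big- and little-endian
 * machines; the designated-initializer form is the portable one.  This helper
 * is hypothetical and never called.
 */
static inline union drbd_state drbd_state_layout_example(void)
{
	/* let the compiler place the bitfields for this architecture */
	return (union drbd_state) {
		{ .role = R_SECONDARY,
		  .conn = C_STANDALONE,
		  .disk = D_DISKLESS } };
}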
2997
2998void drbd_init_set_defaults(struct drbd_conf *mdev)
2999{
3000 /* the memset(,0,) did most of this.
3001 * note: only assignments, no allocation in here */
3002
3003 drbd_set_defaults(mdev);
3004
Philipp Reisnerb411b362009-09-25 16:07:19 -07003005 atomic_set(&mdev->ap_bio_cnt, 0);
3006 atomic_set(&mdev->ap_pending_cnt, 0);
3007 atomic_set(&mdev->rs_pending_cnt, 0);
3008 atomic_set(&mdev->unacked_cnt, 0);
3009 atomic_set(&mdev->local_cnt, 0);
3010 atomic_set(&mdev->net_cnt, 0);
3011 atomic_set(&mdev->packet_seq, 0);
3012 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003013 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02003014 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003015 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02003016 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003017
3018 mutex_init(&mdev->md_io_mutex);
3019 mutex_init(&mdev->data.mutex);
3020 mutex_init(&mdev->meta.mutex);
3021 sema_init(&mdev->data.work.s, 0);
3022 sema_init(&mdev->meta.work.s, 0);
3023 mutex_init(&mdev->state_mutex);
3024
3025 spin_lock_init(&mdev->data.work.q_lock);
3026 spin_lock_init(&mdev->meta.work.q_lock);
3027
3028 spin_lock_init(&mdev->al_lock);
3029 spin_lock_init(&mdev->req_lock);
3030 spin_lock_init(&mdev->peer_seq_lock);
3031 spin_lock_init(&mdev->epoch_lock);
3032
3033 INIT_LIST_HEAD(&mdev->active_ee);
3034 INIT_LIST_HEAD(&mdev->sync_ee);
3035 INIT_LIST_HEAD(&mdev->done_ee);
3036 INIT_LIST_HEAD(&mdev->read_ee);
3037 INIT_LIST_HEAD(&mdev->net_ee);
3038 INIT_LIST_HEAD(&mdev->resync_reads);
3039 INIT_LIST_HEAD(&mdev->data.work.q);
3040 INIT_LIST_HEAD(&mdev->meta.work.q);
3041 INIT_LIST_HEAD(&mdev->resync_work.list);
3042 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003043 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003044 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003045 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003046 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003047
Philipp Reisner794abb72010-12-27 11:51:23 +01003048 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003049 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003050 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003051 mdev->md_sync_work.cb = w_md_sync;
3052 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003053 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003054 init_timer(&mdev->resync_timer);
3055 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003056 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003057 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003058 mdev->resync_timer.function = resync_timer_fn;
3059 mdev->resync_timer.data = (unsigned long) mdev;
3060 mdev->md_sync_timer.function = md_sync_timer_fn;
3061 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003062 mdev->start_resync_timer.function = start_resync_timer_fn;
3063 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003064 mdev->request_timer.function = request_timer_fn;
3065 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003066
3067 init_waitqueue_head(&mdev->misc_wait);
3068 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003069 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003070 init_waitqueue_head(&mdev->ee_wait);
3071 init_waitqueue_head(&mdev->al_wait);
3072 init_waitqueue_head(&mdev->seq_wait);
3073
3074 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3075 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3076 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3077
3078 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003079 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003080 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003081 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3082 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003083}
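/*
 * Side note (illustrative, not part of DRBD): each init_timer()/.function/
 * .data triplet in drbd_init_set_defaults() above could equivalently use the
 * setup_timer() helper, e.g. for the resync timer.  Hypothetical and never
 * called; shown only to document the pattern.
 */
static inline void drbd_timer_setup_example(struct drbd_conf *mdev)
{
	setup_timer(&mdev->resync_timer, resync_timer_fn, (unsigned long)mdev);
}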
3084
3085void drbd_mdev_cleanup(struct drbd_conf *mdev)
3086{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003087 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003088 if (mdev->receiver.t_state != None)
3089 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3090 mdev->receiver.t_state);
3091
3092 /* no need to lock it, I'm the only thread alive */
3093 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3094 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3095 mdev->al_writ_cnt =
3096 mdev->bm_writ_cnt =
3097 mdev->read_cnt =
3098 mdev->recv_cnt =
3099 mdev->send_cnt =
3100 mdev->writ_cnt =
3101 mdev->p_size =
3102 mdev->rs_start =
3103 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003104 mdev->rs_failed = 0;
3105 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003106 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003107 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3108 mdev->rs_mark_left[i] = 0;
3109 mdev->rs_mark_time[i] = 0;
3110 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003111 D_ASSERT(mdev->net_conf == NULL);
3112
3113 drbd_set_my_capacity(mdev, 0);
3114 if (mdev->bitmap) {
3115 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003116 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003117 drbd_bm_cleanup(mdev);
3118 }
3119
3120 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003121 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003122
3123 /*
 3124	 * currently we call drbd_init_ee() only on module load, so
 3125	 * we may call drbd_release_ee() only on module unload!
3126 */
3127 D_ASSERT(list_empty(&mdev->active_ee));
3128 D_ASSERT(list_empty(&mdev->sync_ee));
3129 D_ASSERT(list_empty(&mdev->done_ee));
3130 D_ASSERT(list_empty(&mdev->read_ee));
3131 D_ASSERT(list_empty(&mdev->net_ee));
3132 D_ASSERT(list_empty(&mdev->resync_reads));
3133 D_ASSERT(list_empty(&mdev->data.work.q));
3134 D_ASSERT(list_empty(&mdev->meta.work.q));
3135 D_ASSERT(list_empty(&mdev->resync_work.list));
3136 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003137 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003138
3139 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003140}
3141
3142
3143static void drbd_destroy_mempools(void)
3144{
3145 struct page *page;
3146
3147 while (drbd_pp_pool) {
3148 page = drbd_pp_pool;
3149 drbd_pp_pool = (struct page *)page_private(page);
3150 __free_page(page);
3151 drbd_pp_vacant--;
3152 }
3153
3154 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3155
3156 if (drbd_ee_mempool)
3157 mempool_destroy(drbd_ee_mempool);
3158 if (drbd_request_mempool)
3159 mempool_destroy(drbd_request_mempool);
3160 if (drbd_ee_cache)
3161 kmem_cache_destroy(drbd_ee_cache);
3162 if (drbd_request_cache)
3163 kmem_cache_destroy(drbd_request_cache);
3164 if (drbd_bm_ext_cache)
3165 kmem_cache_destroy(drbd_bm_ext_cache);
3166 if (drbd_al_ext_cache)
3167 kmem_cache_destroy(drbd_al_ext_cache);
3168
3169 drbd_ee_mempool = NULL;
3170 drbd_request_mempool = NULL;
3171 drbd_ee_cache = NULL;
3172 drbd_request_cache = NULL;
3173 drbd_bm_ext_cache = NULL;
3174 drbd_al_ext_cache = NULL;
3175
3176 return;
3177}
3178
3179static int drbd_create_mempools(void)
3180{
3181 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003182 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003183 int i;
3184
3185 /* prepare our caches and mempools */
3186 drbd_request_mempool = NULL;
3187 drbd_ee_cache = NULL;
3188 drbd_request_cache = NULL;
3189 drbd_bm_ext_cache = NULL;
3190 drbd_al_ext_cache = NULL;
3191 drbd_pp_pool = NULL;
3192
3193 /* caches */
3194 drbd_request_cache = kmem_cache_create(
3195 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3196 if (drbd_request_cache == NULL)
3197 goto Enomem;
3198
3199 drbd_ee_cache = kmem_cache_create(
3200 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3201 if (drbd_ee_cache == NULL)
3202 goto Enomem;
3203
3204 drbd_bm_ext_cache = kmem_cache_create(
3205 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3206 if (drbd_bm_ext_cache == NULL)
3207 goto Enomem;
3208
3209 drbd_al_ext_cache = kmem_cache_create(
3210 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3211 if (drbd_al_ext_cache == NULL)
3212 goto Enomem;
3213
3214 /* mempools */
3215 drbd_request_mempool = mempool_create(number,
3216 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3217 if (drbd_request_mempool == NULL)
3218 goto Enomem;
3219
3220 drbd_ee_mempool = mempool_create(number,
3221 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003222 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003223 goto Enomem;
3224
3225 /* drbd's page pool */
3226 spin_lock_init(&drbd_pp_lock);
3227
3228 for (i = 0; i < number; i++) {
3229 page = alloc_page(GFP_HIGHUSER);
3230 if (!page)
3231 goto Enomem;
3232 set_page_private(page, (unsigned long)drbd_pp_pool);
3233 drbd_pp_pool = page;
3234 }
3235 drbd_pp_vacant = number;
3236
3237 return 0;
3238
3239Enomem:
3240 drbd_destroy_mempools(); /* in case we allocated some */
3241 return -ENOMEM;
3242}
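/*
 * Illustrative sketch (not part of DRBD): the page pool built above is a
 * singly linked list threaded through page_private(), protected by
 * drbd_pp_lock.  Popping one page off it looks roughly like this; the real
 * consumer lives in drbd_receiver.c, and this helper is hypothetical.
 */
static struct page *drbd_pp_pop_example(void)
{
	struct page *page = NULL;

	spin_lock(&drbd_pp_lock);
	if (drbd_pp_pool) {
		page = drbd_pp_pool;
		drbd_pp_pool = (struct page *)page_private(page);
		set_page_private(page, 0);
		drbd_pp_vacant--;
	}
	spin_unlock(&drbd_pp_lock);

	/* NULL means the pool is empty; a real caller would fall back to
	 * alloc_page() or wait on drbd_pp_wait. */
	return page;
}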
3243
3244static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3245 void *unused)
3246{
3247 /* just so we have it. you never know what interesting things we
3248 * might want to do here some day...
3249 */
3250
3251 return NOTIFY_DONE;
3252}
3253
3254static struct notifier_block drbd_notifier = {
3255 .notifier_call = drbd_notify_sys,
3256};
3257
3258static void drbd_release_ee_lists(struct drbd_conf *mdev)
3259{
3260 int rr;
3261
3262 rr = drbd_release_ee(mdev, &mdev->active_ee);
3263 if (rr)
3264 dev_err(DEV, "%d EEs in active list found!\n", rr);
3265
3266 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3267 if (rr)
3268 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3269
3270 rr = drbd_release_ee(mdev, &mdev->read_ee);
3271 if (rr)
3272 dev_err(DEV, "%d EEs in read list found!\n", rr);
3273
3274 rr = drbd_release_ee(mdev, &mdev->done_ee);
3275 if (rr)
3276 dev_err(DEV, "%d EEs in done list found!\n", rr);
3277
3278 rr = drbd_release_ee(mdev, &mdev->net_ee);
3279 if (rr)
3280 dev_err(DEV, "%d EEs in net list found!\n", rr);
3281}
3282
3283/* caution. no locking.
3284 * currently only used from module cleanup code. */
3285static void drbd_delete_device(unsigned int minor)
3286{
3287 struct drbd_conf *mdev = minor_to_mdev(minor);
3288
3289 if (!mdev)
3290 return;
3291
3292 /* paranoia asserts */
3293 if (mdev->open_cnt != 0)
3294 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3295 __FILE__ , __LINE__);
3296
3297 ERR_IF (!list_empty(&mdev->data.work.q)) {
3298 struct list_head *lp;
3299 list_for_each(lp, &mdev->data.work.q) {
3300 dev_err(DEV, "lp = %p\n", lp);
3301 }
3302 };
3303 /* end paranoia asserts */
3304
3305 del_gendisk(mdev->vdisk);
3306
3307 /* cleanup stuff that may have been allocated during
3308 * device (re-)configuration or state changes */
3309
3310 if (mdev->this_bdev)
3311 bdput(mdev->this_bdev);
3312
3313 drbd_free_resources(mdev);
3314
3315 drbd_release_ee_lists(mdev);
3316
Bart Van Assche24c48302011-05-21 18:32:29 +02003317 /* should be freed on disconnect? */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003318 kfree(mdev->ee_hash);
3319 /*
3320 mdev->ee_hash_s = 0;
3321 mdev->ee_hash = NULL;
3322 */
3323
3324 lc_destroy(mdev->act_log);
3325 lc_destroy(mdev->resync);
3326
3327 kfree(mdev->p_uuid);
3328 /* mdev->p_uuid = NULL; */
3329
3330 kfree(mdev->int_dig_out);
3331 kfree(mdev->int_dig_in);
3332 kfree(mdev->int_dig_vv);
3333
3334 /* cleanup the rest that has been
3335 * allocated from drbd_new_device
3336 * and actually free the mdev itself */
3337 drbd_free_mdev(mdev);
3338}
3339
3340static void drbd_cleanup(void)
3341{
3342 unsigned int i;
3343
3344 unregister_reboot_notifier(&drbd_notifier);
3345
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003346 /* first remove proc,
 3347	 * drbdsetup uses its presence to detect
 3348	 * whether DRBD is loaded.
 3349	 * If we got stuck in proc removal
 3350	 * with netlink already deregistered,
 3351	 * some drbdsetup commands may wait forever
3352 * for an answer.
3353 */
3354 if (drbd_proc)
3355 remove_proc_entry("drbd", NULL);
3356
Philipp Reisnerb411b362009-09-25 16:07:19 -07003357 drbd_nl_cleanup();
3358
3359 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003360 i = minor_count;
3361 while (i--)
3362 drbd_delete_device(i);
3363 drbd_destroy_mempools();
3364 }
3365
3366 kfree(minor_table);
3367
3368 unregister_blkdev(DRBD_MAJOR, "drbd");
3369
3370 printk(KERN_INFO "drbd: module cleanup done.\n");
3371}
3372
3373/**
3374 * drbd_congested() - Callback for pdflush
3375 * @congested_data: User data
3376 * @bdi_bits: Bits pdflush is currently interested in
3377 *
3378 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3379 */
3380static int drbd_congested(void *congested_data, int bdi_bits)
3381{
3382 struct drbd_conf *mdev = congested_data;
3383 struct request_queue *q;
3384 char reason = '-';
3385 int r = 0;
3386
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003387 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003388 /* DRBD has frozen IO */
3389 r = bdi_bits;
3390 reason = 'd';
3391 goto out;
3392 }
3393
3394 if (get_ldev(mdev)) {
3395 q = bdev_get_queue(mdev->ldev->backing_bdev);
3396 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3397 put_ldev(mdev);
3398 if (r)
3399 reason = 'b';
3400 }
3401
3402 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3403 r |= (1 << BDI_async_congested);
3404 reason = reason == 'b' ? 'a' : 'n';
3405 }
3406
3407out:
3408 mdev->congestion_reason = reason;
3409 return r;
3410}
3411
3412struct drbd_conf *drbd_new_device(unsigned int minor)
3413{
3414 struct drbd_conf *mdev;
3415 struct gendisk *disk;
3416 struct request_queue *q;
3417
3418 /* GFP_KERNEL, we are outside of all write-out paths */
3419 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3420 if (!mdev)
3421 return NULL;
3422 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3423 goto out_no_cpumask;
3424
3425 mdev->minor = minor;
3426
3427 drbd_init_set_defaults(mdev);
3428
3429 q = blk_alloc_queue(GFP_KERNEL);
3430 if (!q)
3431 goto out_no_q;
3432 mdev->rq_queue = q;
3433 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003434
3435 disk = alloc_disk(1);
3436 if (!disk)
3437 goto out_no_disk;
3438 mdev->vdisk = disk;
3439
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003440 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003441
3442 disk->queue = q;
3443 disk->major = DRBD_MAJOR;
3444 disk->first_minor = minor;
3445 disk->fops = &drbd_ops;
3446 sprintf(disk->disk_name, "drbd%d", minor);
3447 disk->private_data = mdev;
3448
3449 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3450 /* we have no partitions. we contain only ourselves. */
3451 mdev->this_bdev->bd_contains = mdev->this_bdev;
3452
3453 q->backing_dev_info.congested_fn = drbd_congested;
3454 q->backing_dev_info.congested_data = mdev;
3455
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003456 blk_queue_make_request(q, drbd_make_request);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003457	/* Setting the max_hw_sectors to an odd value of 8 KiB here.
 3458	   This triggers a max_bio_size message upon first attach or connect */
3459 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003460 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3461 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003462 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003463
3464 mdev->md_io_page = alloc_page(GFP_KERNEL);
3465 if (!mdev->md_io_page)
3466 goto out_no_io_page;
3467
3468 if (drbd_bm_init(mdev))
3469 goto out_no_bitmap;
3470 /* no need to lock access, we are still initializing this minor device. */
3471 if (!tl_init(mdev))
3472 goto out_no_tl;
3473
3474 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3475 if (!mdev->app_reads_hash)
3476 goto out_no_app_reads;
3477
3478 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3479 if (!mdev->current_epoch)
3480 goto out_no_epoch;
3481
3482 INIT_LIST_HEAD(&mdev->current_epoch->list);
3483 mdev->epochs = 1;
3484
3485 return mdev;
3486
3487/* out_whatever_else:
3488 kfree(mdev->current_epoch); */
3489out_no_epoch:
3490 kfree(mdev->app_reads_hash);
3491out_no_app_reads:
3492 tl_cleanup(mdev);
3493out_no_tl:
3494 drbd_bm_cleanup(mdev);
3495out_no_bitmap:
3496 __free_page(mdev->md_io_page);
3497out_no_io_page:
3498 put_disk(disk);
3499out_no_disk:
3500 blk_cleanup_queue(q);
3501out_no_q:
3502 free_cpumask_var(mdev->cpu_mask);
3503out_no_cpumask:
3504 kfree(mdev);
3505 return NULL;
3506}
3507
3508/* counterpart of drbd_new_device.
3509 * last part of drbd_delete_device. */
3510void drbd_free_mdev(struct drbd_conf *mdev)
3511{
3512 kfree(mdev->current_epoch);
3513 kfree(mdev->app_reads_hash);
3514 tl_cleanup(mdev);
3515 if (mdev->bitmap) /* should no longer be there. */
3516 drbd_bm_cleanup(mdev);
3517 __free_page(mdev->md_io_page);
3518 put_disk(mdev->vdisk);
3519 blk_cleanup_queue(mdev->rq_queue);
3520 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003521 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003522 kfree(mdev);
3523}
3524
3525
3526int __init drbd_init(void)
3527{
3528 int err;
3529
3530 if (sizeof(struct p_handshake) != 80) {
3531 printk(KERN_ERR
3532 "drbd: never change the size or layout "
3533 "of the HandShake packet.\n");
3534 return -EINVAL;
3535 }
3536
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003537 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003538 printk(KERN_ERR
3539 "drbd: invalid minor_count (%d)\n", minor_count);
3540#ifdef MODULE
3541 return -EINVAL;
3542#else
3543 minor_count = 8;
3544#endif
3545 }
3546
3547 err = drbd_nl_init();
3548 if (err)
3549 return err;
3550
3551 err = register_blkdev(DRBD_MAJOR, "drbd");
3552 if (err) {
3553 printk(KERN_ERR
3554 "drbd: unable to register block device major %d\n",
3555 DRBD_MAJOR);
3556 return err;
3557 }
3558
3559 register_reboot_notifier(&drbd_notifier);
3560
3561 /*
3562 * allocate all necessary structs
3563 */
3564 err = -ENOMEM;
3565
3566 init_waitqueue_head(&drbd_pp_wait);
3567
3568 drbd_proc = NULL; /* play safe for drbd_cleanup */
3569 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3570 GFP_KERNEL);
3571 if (!minor_table)
3572 goto Enomem;
3573
3574 err = drbd_create_mempools();
3575 if (err)
3576 goto Enomem;
3577
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003578 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003579 if (!drbd_proc) {
3580 printk(KERN_ERR "drbd: unable to register proc file\n");
3581 goto Enomem;
3582 }
3583
3584 rwlock_init(&global_state_lock);
3585
3586 printk(KERN_INFO "drbd: initialized. "
3587 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3588 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3589 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3590 printk(KERN_INFO "drbd: registered as block device major %d\n",
3591 DRBD_MAJOR);
3592 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3593
3594 return 0; /* Success! */
3595
3596Enomem:
3597 drbd_cleanup();
3598 if (err == -ENOMEM)
3599 /* currently always the case */
3600 printk(KERN_ERR "drbd: ran out of memory\n");
3601 else
3602 printk(KERN_ERR "drbd: initialization failure\n");
3603 return err;
3604}
3605
3606void drbd_free_bc(struct drbd_backing_dev *ldev)
3607{
3608 if (ldev == NULL)
3609 return;
3610
Tejun Heoe525fd82010-11-13 11:55:17 +01003611 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3612 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003613
3614 kfree(ldev);
3615}
3616
3617void drbd_free_sock(struct drbd_conf *mdev)
3618{
3619 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003620 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003621 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3622 sock_release(mdev->data.socket);
3623 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003624 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003625 }
3626 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003627 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003628 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3629 sock_release(mdev->meta.socket);
3630 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003631 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003632 }
3633}
3634
3635
3636void drbd_free_resources(struct drbd_conf *mdev)
3637{
3638 crypto_free_hash(mdev->csums_tfm);
3639 mdev->csums_tfm = NULL;
3640 crypto_free_hash(mdev->verify_tfm);
3641 mdev->verify_tfm = NULL;
3642 crypto_free_hash(mdev->cram_hmac_tfm);
3643 mdev->cram_hmac_tfm = NULL;
3644 crypto_free_hash(mdev->integrity_w_tfm);
3645 mdev->integrity_w_tfm = NULL;
3646 crypto_free_hash(mdev->integrity_r_tfm);
3647 mdev->integrity_r_tfm = NULL;
3648
3649 drbd_free_sock(mdev);
3650
3651 __no_warn(local,
3652 drbd_free_bc(mdev->ldev);
3653 mdev->ldev = NULL;);
3654}
3655
3656/* meta data management */
3657
3658struct meta_data_on_disk {
3659 u64 la_size; /* last agreed size. */
3660 u64 uuid[UI_SIZE]; /* UUIDs. */
3661 u64 device_uuid;
3662 u64 reserved_u64_1;
3663 u32 flags; /* MDF */
3664 u32 magic;
3665 u32 md_size_sect;
3666 u32 al_offset; /* offset to this block */
3667 u32 al_nr_extents; /* important for restoring the AL */
3668 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3669 u32 bm_offset; /* offset to the bitmap, from here */
3670 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003671 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3672 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003673
3674} __packed;
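/*
 * Illustrative sketch (not part of DRBD): drbd_md_sync() below clears and
 * writes exactly one 512 byte sector, so struct meta_data_on_disk must never
 * outgrow that.  A hypothetical compile-time guard for this assumption:
 */
static inline void drbd_md_layout_check_example(void)
{
	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
}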
3675
3676/**
3677 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3678 * @mdev: DRBD device.
3679 */
3680void drbd_md_sync(struct drbd_conf *mdev)
3681{
3682 struct meta_data_on_disk *buffer;
3683 sector_t sector;
3684 int i;
3685
Lars Ellenbergee15b032010-09-03 10:00:09 +02003686 del_timer(&mdev->md_sync_timer);
3687 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003688 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3689 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003690
3691 /* We use here D_FAILED and not D_ATTACHING because we try to write
3692 * metadata even if we detach due to a disk failure! */
3693 if (!get_ldev_if_state(mdev, D_FAILED))
3694 return;
3695
Philipp Reisnerb411b362009-09-25 16:07:19 -07003696 mutex_lock(&mdev->md_io_mutex);
3697 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3698 memset(buffer, 0, 512);
3699
3700 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3701 for (i = UI_CURRENT; i < UI_SIZE; i++)
3702 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3703 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3704 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3705
3706 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3707 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3708 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3709 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3710 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3711
3712 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003713 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003714
3715 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3716 sector = mdev->ldev->md.md_offset;
3717
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003718 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003719 /* this was a try anyways ... */
3720 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003721 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003722 }
3723
3724 /* Update mdev->ldev->md.la_size_sect,
3725 * since we updated it on metadata. */
3726 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3727
3728 mutex_unlock(&mdev->md_io_mutex);
3729 put_ldev(mdev);
3730}
3731
3732/**
3733 * drbd_md_read() - Reads in the meta data super block
3734 * @mdev: DRBD device.
3735 * @bdev: Device from which the meta data should be read in.
3736 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003737 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003738 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3739 */
3740int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3741{
3742 struct meta_data_on_disk *buffer;
3743 int i, rv = NO_ERROR;
3744
3745 if (!get_ldev_if_state(mdev, D_ATTACHING))
3746 return ERR_IO_MD_DISK;
3747
Philipp Reisnerb411b362009-09-25 16:07:19 -07003748 mutex_lock(&mdev->md_io_mutex);
3749 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3750
3751 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003752 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003753 called BEFORE disk is attached */
3754 dev_err(DEV, "Error while reading metadata.\n");
3755 rv = ERR_IO_MD_DISK;
3756 goto err;
3757 }
3758
3759 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3760 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3761 rv = ERR_MD_INVALID;
3762 goto err;
3763 }
3764 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3765 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3766 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3767 rv = ERR_MD_INVALID;
3768 goto err;
3769 }
3770 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3771 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3772 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3773 rv = ERR_MD_INVALID;
3774 goto err;
3775 }
3776 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3777 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3778 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3779 rv = ERR_MD_INVALID;
3780 goto err;
3781 }
3782
3783 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3784 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3785 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3786 rv = ERR_MD_INVALID;
3787 goto err;
3788 }
3789
3790 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3791 for (i = UI_CURRENT; i < UI_SIZE; i++)
3792 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3793 bdev->md.flags = be32_to_cpu(buffer->flags);
3794 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3795 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3796
Philipp Reisner99432fc2011-05-20 16:39:13 +02003797 spin_lock_irq(&mdev->req_lock);
3798 if (mdev->state.conn < C_CONNECTED) {
3799 int peer;
3800 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3801 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3802 mdev->peer_max_bio_size = peer;
3803 }
3804 spin_unlock_irq(&mdev->req_lock);
3805
Philipp Reisnerb411b362009-09-25 16:07:19 -07003806 if (mdev->sync_conf.al_extents < 7)
3807 mdev->sync_conf.al_extents = 127;
3808
3809 err:
3810 mutex_unlock(&mdev->md_io_mutex);
3811 put_ldev(mdev);
3812
3813 return rv;
3814}
3815
3816/**
3817 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3818 * @mdev: DRBD device.
3819 *
3820 * Call this function if you change anything that should be written to
3821 * the meta-data super block. This function sets MD_DIRTY, and starts a
3822 * timer that ensures that within five seconds you have to call drbd_md_sync().
3823 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003824#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003825void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3826{
3827 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3828 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3829 mdev->last_md_mark_dirty.line = line;
3830 mdev->last_md_mark_dirty.func = func;
3831 }
3832}
3833#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003834void drbd_md_mark_dirty(struct drbd_conf *mdev)
3835{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003836 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003837 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003838}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003839#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07003840
3841static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3842{
3843 int i;
3844
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003845 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003846 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003847}
3848
3849void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3850{
3851 if (idx == UI_CURRENT) {
3852 if (mdev->state.role == R_PRIMARY)
3853 val |= 1;
3854 else
3855 val &= ~((u64)1);
3856
3857 drbd_set_ed_uuid(mdev, val);
3858 }
3859
3860 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003861 drbd_md_mark_dirty(mdev);
3862}
3863
3864
3865void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3866{
3867 if (mdev->ldev->md.uuid[idx]) {
3868 drbd_uuid_move_history(mdev);
3869 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003870 }
3871 _drbd_uuid_set(mdev, idx, val);
3872}
3873
3874/**
3875 * drbd_uuid_new_current() - Creates a new current UUID
3876 * @mdev: DRBD device.
3877 *
3878 * Creates a new current UUID, and rotates the old current UUID into
3879 * the bitmap slot. Causes an incremental resync upon next connect.
3880 */
3881void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3882{
3883 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003884 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003885
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003886 if (bm_uuid)
3887 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3888
Philipp Reisnerb411b362009-09-25 16:07:19 -07003889 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003890
3891 get_random_bytes(&val, sizeof(u64));
3892 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003893 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003894 /* get it to stable storage _now_ */
3895 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003896}
3897
3898void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3899{
3900 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3901 return;
3902
3903 if (val == 0) {
3904 drbd_uuid_move_history(mdev);
3905 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3906 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003907 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003908 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3909 if (bm_uuid)
3910 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003911
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003912 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003913 }
3914 drbd_md_mark_dirty(mdev);
3915}
3916
3917/**
3918 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3919 * @mdev: DRBD device.
3920 *
3921 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3922 */
3923int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3924{
3925 int rv = -EIO;
3926
3927 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3928 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3929 drbd_md_sync(mdev);
3930 drbd_bm_set_all(mdev);
3931
3932 rv = drbd_bm_write(mdev);
3933
3934 if (!rv) {
3935 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3936 drbd_md_sync(mdev);
3937 }
3938
3939 put_ldev(mdev);
3940 }
3941
3942 return rv;
3943}
3944
3945/**
3946 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3947 * @mdev: DRBD device.
3948 *
3949 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3950 */
3951int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3952{
3953 int rv = -EIO;
3954
Philipp Reisner07782862010-08-31 12:00:50 +02003955 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003956 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3957 drbd_bm_clear_all(mdev);
3958 rv = drbd_bm_write(mdev);
3959 put_ldev(mdev);
3960 }
3961
3962 return rv;
3963}
3964
3965static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3966{
3967 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003968 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003969
3970 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3971
Lars Ellenberg02851e92010-12-16 14:47:39 +01003972 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003973 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003974 rv = work->io_fn(mdev);
3975 drbd_bm_unlock(mdev);
3976 put_ldev(mdev);
3977 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003978
3979 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003980 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07003981 wake_up(&mdev->misc_wait);
3982
3983 if (work->done)
3984 work->done(mdev, rv);
3985
3986 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3987 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003988 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003989
3990 return 1;
3991}
3992
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003993void drbd_ldev_destroy(struct drbd_conf *mdev)
3994{
3995 lc_destroy(mdev->resync);
3996 mdev->resync = NULL;
3997 lc_destroy(mdev->act_log);
3998 mdev->act_log = NULL;
3999 __no_warn(local,
4000 drbd_free_bc(mdev->ldev);
4001 mdev->ldev = NULL;);
4002
4003 if (mdev->md_io_tmpp) {
4004 __free_page(mdev->md_io_tmpp);
4005 mdev->md_io_tmpp = NULL;
4006 }
4007 clear_bit(GO_DISKLESS, &mdev->flags);
4008}
4009
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004010static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4011{
4012 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02004013 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4014 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02004015 * the protected members anymore, though, so once put_ldev reaches zero
4016 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004017 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004018 return 1;
4019}
4020
4021void drbd_go_diskless(struct drbd_conf *mdev)
4022{
4023 D_ASSERT(mdev->state.disk == D_FAILED);
4024 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02004025 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004026}
4027
Philipp Reisnerb411b362009-09-25 16:07:19 -07004028/**
4029 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4030 * @mdev: DRBD device.
4031 * @io_fn: IO callback to be called when bitmap IO is possible
4032 * @done: callback to be called after the bitmap IO was performed
4033 * @why: Descriptive text of the reason for doing the IO
4034 *
 4035	 * While IO on the bitmap happens we freeze application IO, thus ensuring
 4036	 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4037 * called from worker context. It MUST NOT be used while a previous such
4038 * work is still pending!
4039 */
4040void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4041 int (*io_fn)(struct drbd_conf *),
4042 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004043 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004044{
4045 D_ASSERT(current == mdev->worker.task);
4046
4047 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4048 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4049 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4050 if (mdev->bm_io_work.why)
4051 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4052 why, mdev->bm_io_work.why);
4053
4054 mdev->bm_io_work.io_fn = io_fn;
4055 mdev->bm_io_work.done = done;
4056 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004057 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004058
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004059 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004060 set_bit(BITMAP_IO, &mdev->flags);
4061 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004062 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004063 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004064 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004065 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004066}
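/*
 * Illustrative sketch (not part of DRBD): a worker-context caller queueing a
 * full "set all bits and write out" pass through drbd_queue_bitmap_io().
 * Both helpers below are hypothetical (real callers live in drbd_nl.c and
 * drbd_receiver.c), and the bm_flag choice is only an example.
 */
static void drbd_bm_example_done(struct drbd_conf *mdev, int rv)
{
	if (rv)
		dev_err(DEV, "example bitmap IO failed: %d\n", rv);
}

static void drbd_queue_full_sync_example(struct drbd_conf *mdev)
{
	/* must run in worker context, see the restrictions documented above */
	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
			     &drbd_bm_example_done,
			     "example: set_n_write", BM_LOCKED_SET_ALLOWED);
}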
4067
4068/**
4069 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4070 * @mdev: DRBD device.
4071 * @io_fn: IO callback to be called when bitmap IO is possible
4072 * @why: Descriptive text of the reason for doing the IO
4073 *
 4074	 * Freezes application IO while the actual IO operation runs. This
 4075	 * function MAY NOT be called from worker context.
4076 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004077int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4078 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004079{
4080 int rv;
4081
4082 D_ASSERT(current != mdev->worker.task);
4083
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004084 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4085 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004086
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004087 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004088 rv = io_fn(mdev);
4089 drbd_bm_unlock(mdev);
4090
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004091 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4092 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004093
4094 return rv;
4095}
4096
4097void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4098{
4099 if ((mdev->ldev->md.flags & flag) != flag) {
4100 drbd_md_mark_dirty(mdev);
4101 mdev->ldev->md.flags |= flag;
4102 }
4103}
4104
4105void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4106{
4107 if ((mdev->ldev->md.flags & flag) != 0) {
4108 drbd_md_mark_dirty(mdev);
4109 mdev->ldev->md.flags &= ~flag;
4110 }
4111}
4112int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4113{
4114 return (bdev->md.flags & flag) != 0;
4115}
4116
4117static void md_sync_timer_fn(unsigned long data)
4118{
4119 struct drbd_conf *mdev = (struct drbd_conf *) data;
4120
4121 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4122}
4123
4124static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4125{
4126 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004127#ifdef DEBUG
4128 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4129 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4130#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004131 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004132 return 1;
4133}
4134
4135#ifdef CONFIG_DRBD_FAULT_INJECTION
4136/* Fault insertion support including random number generator shamelessly
4137 * stolen from kernel/rcutorture.c */
4138struct fault_random_state {
4139 unsigned long state;
4140 unsigned long count;
4141};
4142
4143#define FAULT_RANDOM_MULT 39916801 /* prime */
4144#define FAULT_RANDOM_ADD 479001701 /* prime */
4145#define FAULT_RANDOM_REFRESH 10000
4146
4147/*
4148 * Crude but fast random-number generator. Uses a linear congruential
4149 * generator, with occasional help from get_random_bytes().
4150 */
4151static unsigned long
4152_drbd_fault_random(struct fault_random_state *rsp)
4153{
4154 long refresh;
4155
Roel Kluin49829ea2009-12-15 22:55:44 +01004156 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004157 get_random_bytes(&refresh, sizeof(refresh));
4158 rsp->state += refresh;
4159 rsp->count = FAULT_RANDOM_REFRESH;
4160 }
4161 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4162 return swahw32(rsp->state);
4163}
4164
4165static char *
4166_drbd_fault_str(unsigned int type) {
4167 static char *_faults[] = {
4168 [DRBD_FAULT_MD_WR] = "Meta-data write",
4169 [DRBD_FAULT_MD_RD] = "Meta-data read",
4170 [DRBD_FAULT_RS_WR] = "Resync write",
4171 [DRBD_FAULT_RS_RD] = "Resync read",
4172 [DRBD_FAULT_DT_WR] = "Data write",
4173 [DRBD_FAULT_DT_RD] = "Data read",
4174 [DRBD_FAULT_DT_RA] = "Data read ahead",
4175 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004176 [DRBD_FAULT_AL_EE] = "EE allocation",
4177 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004178 };
4179
4180 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4181}
4182
4183unsigned int
4184_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4185{
4186 static struct fault_random_state rrs = {0, 0};
4187
4188 unsigned int ret = (
4189 (fault_devs == 0 ||
4190 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4191 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4192
4193 if (ret) {
4194 fault_count++;
4195
Lars Ellenberg73835062010-05-27 11:51:56 +02004196 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004197 dev_warn(DEV, "***Simulating %s failure\n",
4198 _drbd_fault_str(type));
4199 }
4200
4201 return ret;
4202}
4203#endif
4204
4205const char *drbd_buildtag(void)
4206{
 4207	/* DRBD built from external sources carries a reference here to the
 4208	   git hash of the source code. */
4209
4210 static char buildtag[38] = "\0uilt-in";
4211
4212 if (buildtag[0] == 0) {
4213#ifdef CONFIG_MODULES
4214 if (THIS_MODULE != NULL)
4215 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4216 else
4217#endif
4218 buildtag[0] = 'b';
4219 }
4220
4221 return buildtag;
4222}
4223
4224module_init(drbd_init)
4225module_exit(drbd_cleanup)
4226
Philipp Reisnerb411b362009-09-25 16:07:19 -07004227EXPORT_SYMBOL(drbd_conn_str);
4228EXPORT_SYMBOL(drbd_role_str);
4229EXPORT_SYMBOL(drbd_disk_str);
4230EXPORT_SYMBOL(drbd_set_st_err_str);